import quandl
import pandas as pd
import numpy as np
import datetime as dt
import pandas_profiling
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
# import libraries here; add more as necessary
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split
from collections import defaultdict
# Import supplementary visualization code visuals.py
#import visuals as vs
from numpy import concatenate
# magic word for producing visualizations in notebook.allow plots to appear directly in the notebook
%matplotlib inline
quandl.ApiConfig.api_key = "v5Bazu_S389s29HiutZh"
#Get data from Quandl APIs into dataframes
SP500_DIV_YIELD_MONTH = quandl.get('MULTPL/SP500_DIV_YIELD_MONTH') #MULTPLkeys[0]
SP500_PE_RATIO_MONTH = quandl.get('MULTPL/SP500_PE_RATIO_MONTH')
SHILLER_PE_RATIO_MONTH = quandl.get('MULTPL/SHILLER_PE_RATIO_MONTH')
SP500_EARNINGS_YIELD_MONTH = quandl.get('MULTPL/SP500_EARNINGS_YIELD_MONTH')
SP500_INFLADJ_MONTH = quandl.get('MULTPL/SP500_INFLADJ_MONTH')#MULTPLkeys[4]
SP500_PSR_QUARTER = quandl.get('MULTPL/SP500_PSR_QUARTER')
SP500_DIV_MONTH = quandl.get('MULTPL/SP500_DIV_MONTH')
SP500_DIV_YEAR = quandl.get('MULTPL/SP500_DIV_YEAR')
SP500_DIV_GROWTH_YEAR = quandl.get('MULTPL/SP500_DIV_GROWTH_YEAR')
SP500_DIV_GROWTH_QUARTER = quandl.get('MULTPL/SP500_DIV_GROWTH_QUARTER')
SP500_PBV_RATIO_QUARTER = quandl.get('MULTPL/SP500_PBV_RATIO_QUARTER') #MULTPLkeys[10]
SHILLER_PE_RATIO_YEAR = quandl.get('MULTPL/SHILLER_PE_RATIO_YEAR')
SP500_PE_RATIO_YEAR = quandl.get('MULTPL/SP500_PE_RATIO_YEAR')
SP500_DIV_YIELD_YEAR = quandl.get('MULTPL/SP500_DIV_YIELD_YEAR')
SP500_PSR_YEAR = quandl.get('MULTPL/SP500_PSR_YEAR')
SP500_EARNINGS_YIELD_YEAR = quandl.get('MULTPL/SP500_EARNINGS_YIELD_YEAR')
SP500_PBV_RATIO_YEAR = quandl.get('MULTPL/SP500_PBV_RATIO_YEAR')
SP500_INFLADJ_YEAR = quandl.get('MULTPL/SP500_INFLADJ_YEAR')
SP500_REAL_PRICE_MONTH = quandl.get('MULTPL/SP500_REAL_PRICE_MONTH') #MULTPLkeys[18]
SP500_SALES_YEAR = quandl.get('MULTPL/SP500_SALES_YEAR')
SP500_SALES_GROWTH_YEAR = quandl.get('MULTPL/SP500_SALES_GROWTH_YEAR') #MULTPLkeys[20]
SP500_SALES_QUARTER = quandl.get('MULTPL/SP500_SALES_QUARTER')
SP500_REAL_SALES_GROWTH_QUARTER = quandl.get('MULTPL/SP500_REAL_SALES_GROWTH_QUARTER')
SP500_SALES_GROWTH_QUARTER = quandl.get('MULTPL/SP500_SALES_GROWTH_QUARTER')
SP500_REAL_SALES_GROWTH_YEAR = quandl.get('MULTPL/SP500_REAL_SALES_GROWTH_YEAR')
SP500_REAL_EARNINGS_GROWTH_YEAR = quandl.get('MULTPL/SP500_REAL_EARNINGS_GROWTH_YEAR')
SP500_REAL_SALES_YEAR = quandl.get('MULTPL/SP500_REAL_SALES_YEAR')
SP500_REAL_EARNINGS_GROWTH_QUARTER = quandl.get('MULTPL/SP500_REAL_EARNINGS_GROWTH_QUARTER')
SP500_EARNINGS_GROWTH_QUARTER = quandl.get('MULTPL/SP500_EARNINGS_GROWTH_QUARTER')
SP500_REAL_SALES_QUARTER = quandl.get('MULTPL/SP500_REAL_SALES_QUARTER')
SP500_EARNINGS_MONTH = quandl.get('MULTPL/SP500_EARNINGS_MONTH') #MULTPLkeys[30]
SP500_BVPS_YEAR = quandl.get('MULTPL/SP500_BVPS_YEAR')
SP500_EARNINGS_YEAR = quandl.get('MULTPL/SP500_EARNINGS_YEAR')
SP500_EARNINGS_GROWTH_YEAR = quandl.get('MULTPL/SP500_EARNINGS_GROWTH_YEAR')
SP500_BVPS_QUARTER = quandl.get('MULTPL/SP500_BVPS_QUARTER')
SP500_REAL_PRICE_YEAR = quandl.get('MULTPL/SP500_REAL_PRICE_YEAR') #MULTPLkeys[35]
#API keywords list
MULTPLkeys = [
'MULTPL/SP500_DIV_YIELD_MONTH',
'MULTPL/SP500_PE_RATIO_MONTH',
'MULTPL/SHILLER_PE_RATIO_MONTH',
'MULTPL/SP500_EARNINGS_YIELD_MONTH',
'MULTPL/SP500_INFLADJ_MONTH',
'MULTPL/SP500_PSR_QUARTER',
'MULTPL/SP500_DIV_MONTH',
'MULTPL/SP500_DIV_YEAR',
'MULTPL/SP500_DIV_GROWTH_YEAR',
'MULTPL/SP500_DIV_GROWTH_QUARTER',
'MULTPL/SP500_PBV_RATIO_QUARTER',
'MULTPL/SHILLER_PE_RATIO_YEAR',
'MULTPL/SP500_PE_RATIO_YEAR',
'MULTPL/SP500_DIV_YIELD_YEAR',
'MULTPL/SP500_PSR_YEAR',
'MULTPL/SP500_EARNINGS_YIELD_YEAR',
'MULTPL/SP500_PBV_RATIO_YEAR',
'MULTPL/SP500_INFLADJ_YEAR',
'MULTPL/SP500_REAL_PRICE_MONTH',
'MULTPL/SP500_SALES_YEAR',
'MULTPL/SP500_SALES_GROWTH_YEAR',
'MULTPL/SP500_SALES_QUARTER',
'MULTPL/SP500_REAL_SALES_GROWTH_QUARTER',
'MULTPL/SP500_SALES_GROWTH_QUARTER',
'MULTPL/SP500_REAL_SALES_GROWTH_YEAR',
'MULTPL/SP500_REAL_EARNINGS_GROWTH_YEAR',
'MULTPL/SP500_REAL_SALES_YEAR',
'MULTPL/SP500_REAL_EARNINGS_GROWTH_QUARTER',
'MULTPL/SP500_EARNINGS_GROWTH_QUARTER',
'MULTPL/SP500_REAL_SALES_QUARTER',
'MULTPL/SP500_EARNINGS_MONTH',
'MULTPL/SP500_BVPS_YEAR',
'MULTPL/SP500_EARNINGS_YEAR',
'MULTPL/SP500_EARNINGS_GROWTH_YEAR',
'MULTPL/SP500_BVPS_QUARTER',
'MULTPL/SP500_REAL_PRICE_YEAR'
]
#Split API keywords to create df variable names
var_list = [i.split('/')[1] for i in MULTPLkeys]
print(var_list[0],var_list[1],var_list[2])
SP500_DIV_YIELD_MONTH SP500_PE_RATIO_MONTH SHILLER_PE_RATIO_MONTH
# def get_data(i):
# return quandl.get('MULTPL/{}'.format(i))
var_list
['SP500_DIV_YIELD_MONTH', 'SP500_PE_RATIO_MONTH', 'SHILLER_PE_RATIO_MONTH', 'SP500_EARNINGS_YIELD_MONTH', 'SP500_INFLADJ_MONTH', 'SP500_PSR_QUARTER', 'SP500_DIV_MONTH', 'SP500_DIV_YEAR', 'SP500_DIV_GROWTH_YEAR', 'SP500_DIV_GROWTH_QUARTER', 'SP500_PBV_RATIO_QUARTER', 'SHILLER_PE_RATIO_YEAR', 'SP500_PE_RATIO_YEAR', 'SP500_DIV_YIELD_YEAR', 'SP500_PSR_YEAR', 'SP500_EARNINGS_YIELD_YEAR', 'SP500_PBV_RATIO_YEAR', 'SP500_INFLADJ_YEAR', 'SP500_REAL_PRICE_MONTH', 'SP500_SALES_YEAR', 'SP500_SALES_GROWTH_YEAR', 'SP500_SALES_QUARTER', 'SP500_REAL_SALES_GROWTH_QUARTER', 'SP500_SALES_GROWTH_QUARTER', 'SP500_REAL_SALES_GROWTH_YEAR', 'SP500_REAL_EARNINGS_GROWTH_YEAR', 'SP500_REAL_SALES_YEAR', 'SP500_REAL_EARNINGS_GROWTH_QUARTER', 'SP500_EARNINGS_GROWTH_QUARTER', 'SP500_REAL_SALES_QUARTER', 'SP500_EARNINGS_MONTH', 'SP500_BVPS_YEAR', 'SP500_EARNINGS_YEAR', 'SP500_EARNINGS_GROWTH_YEAR', 'SP500_BVPS_QUARTER', 'SP500_REAL_PRICE_YEAR']
# var_list1 = [
# 'SP500_DIV_YIELD_MONTH1',
# 'SP500_PE_RATIO_MONTH2',
# 'SHILLER_PE_RATIO_MONTH']
# for i in var_list:
# #i = pd.DataFrame(get_data(i))
# exec(f'{i} = get_data(i)')
# #print(get_data(i))
# print(i)
# break
# x= 'SP500_DIV_YIELD_MONTH'
# exec("%s = %d" % (x,0))
# print(SP500_DIV_YIELD_MONTH)
# print(x)
#exec(f'{i} = get_data(i)')
# for i in range(len(var_list)//10):
# x= var_list[i]
# exec("%s = %d" % (x,0))
# #print(var_list[i])
# #print(x)
# print(SP500_DIV_YIELD_MONTH)
# # SP500_DIV_YIELD_MONTH = quandl.get(str(MULTPLkeys[0]))
# # print(SP500_DIV_YIELD_MONTH.head())
# #SP500_DIV_YIELD_MONTH
# #SP500_PE_RATIO_MONTH
# #SHILLER_PE_RATIO_MONTH
# #dynamic variables
# x= var_list[1]
# exec("%s = %s" % (x,x))
# SP500_PE_RATIO_MONTH
print(SP500_REAL_PRICE_MONTH.head())
print(SP500_EARNINGS_YIELD_MONTH.head())
Value
Date
1871-01-01 4.44
1871-02-01 4.50
1871-03-01 4.61
1871-04-01 4.74
1871-05-01 4.86
Value
Date
1871-01-01 9.01
1871-02-01 8.89
1871-03-01 8.68
1871-04-01 8.44
1871-05-01 8.23
df = pd.merge(pd.merge(SP500_REAL_PRICE_MONTH,SP500_EARNINGS_YIELD_MONTH,on='Date'),SHILLER_PE_RATIO_MONTH,on='Date') df.head()
Assumptions-
outer join between price and monthly yield df to include all dates of every month.
SP500_REAL_PRICE_MONTH is the target label hence start with this df to include all rows
SP500_DIV_YIELD_MONTH.head()
| Value | |
|---|---|
| Date | |
| 1871-01-31 | 5.86 |
| 1871-02-28 | 5.78 |
| 1871-03-31 | 5.64 |
| 1871-04-30 | 5.49 |
| 1871-05-31 | 5.35 |
#Join all month dataframes together on Date column.
df = SP500_REAL_PRICE_MONTH.join(SP500_DIV_YIELD_MONTH,on=None,how='outer',
lsuffix='_SP500_REAL_PRICE_MONTH',rsuffix='_SP500_DIV_YIELD_MONTH',sort=False)
df= df.join(SP500_PE_RATIO_MONTH,on='Date',how='left',sort=False)
df = df.merge(
SHILLER_PE_RATIO_MONTH,on='Date',how='left').merge(
SP500_EARNINGS_YIELD_MONTH,on='Date',how='left').merge(
SP500_INFLADJ_MONTH,on='Date',how='left').merge(
SP500_PSR_QUARTER,on='Date',how='left').merge(
SP500_DIV_MONTH,on='Date',how='outer').merge(
SP500_DIV_YEAR,on='Date',how='left')
df.columns = ['Value_SP500_REAL_PRICE_MONTH','Value_SP500_DIV_YIELD_MONTH','Value_SP500_PE_RATIO_MONTH',
'Value_SHILLER_PE_RATIO_MONTH','Value_SP500_EARNINGS_YIELD_MONTH','Value_SP500_INFLADJ_MONTH',
'Value_SP500_PSR_QUARTER','Value_SP500_DIV_MONTH','Value_SP500_DIV_YEAR'
]
#print(df.head(6))
print(df.columns)
print(df.shape)
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
'Value_SP500_DIV_YEAR'],
dtype='object')
(3548, 9)
def Merge__Rename_function(df,df_var,column_name):
df = df.merge(df_var,on='Date',how='outer')
Column_name = 'Value_' + str(column_name)
#print(Column_name)
df.rename({'Value':Column_name},axis=1,inplace=True)
#print(df.head(6))
print(df.shape)
#print(df.columns)
return df
var_list
['SP500_DIV_YIELD_MONTH', 'SP500_PE_RATIO_MONTH', 'SHILLER_PE_RATIO_MONTH', 'SP500_EARNINGS_YIELD_MONTH', 'SP500_INFLADJ_MONTH', 'SP500_PSR_QUARTER', 'SP500_DIV_MONTH', 'SP500_DIV_YEAR', 'SP500_DIV_GROWTH_YEAR', 'SP500_DIV_GROWTH_QUARTER', 'SP500_PBV_RATIO_QUARTER', 'SHILLER_PE_RATIO_YEAR', 'SP500_PE_RATIO_YEAR', 'SP500_DIV_YIELD_YEAR', 'SP500_PSR_YEAR', 'SP500_EARNINGS_YIELD_YEAR', 'SP500_PBV_RATIO_YEAR', 'SP500_INFLADJ_YEAR', 'SP500_REAL_PRICE_MONTH', 'SP500_SALES_YEAR', 'SP500_SALES_GROWTH_YEAR', 'SP500_SALES_QUARTER', 'SP500_REAL_SALES_GROWTH_QUARTER', 'SP500_SALES_GROWTH_QUARTER', 'SP500_REAL_SALES_GROWTH_YEAR', 'SP500_REAL_EARNINGS_GROWTH_YEAR', 'SP500_REAL_SALES_YEAR', 'SP500_REAL_EARNINGS_GROWTH_QUARTER', 'SP500_EARNINGS_GROWTH_QUARTER', 'SP500_REAL_SALES_QUARTER', 'SP500_EARNINGS_MONTH', 'SP500_BVPS_YEAR', 'SP500_EARNINGS_YEAR', 'SP500_EARNINGS_GROWTH_YEAR', 'SP500_BVPS_QUARTER', 'SP500_REAL_PRICE_YEAR']
df = Merge__Rename_function(df,SP500_DIV_GROWTH_YEAR,'SP500_DIV_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_DIV_GROWTH_QUARTER,'SP500_DIV_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_PBV_RATIO_QUARTER,'SP500_PBV_RATIO_QUARTER')
df = Merge__Rename_function(df,SHILLER_PE_RATIO_YEAR,'SHILLER_PE_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_PE_RATIO_YEAR,'SP500_PE_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_DIV_YIELD_YEAR,'SP500_DIV_YIELD_YEAR')
df = Merge__Rename_function(df,SP500_PSR_YEAR,'SP500_PSR_YEAR')
df = Merge__Rename_function(df,SP500_EARNINGS_YIELD_YEAR,'SP500_EARNINGS_YIELD_YEAR')
df = Merge__Rename_function(df,SP500_PBV_RATIO_YEAR,'SP500_PBV_RATIO_YEAR')
df = Merge__Rename_function(df,SP500_INFLADJ_YEAR,'SP500_INFLADJ_YEAR')
df = Merge__Rename_function(df,SP500_SALES_YEAR,'SP500_SALES_YEAR')
df = Merge__Rename_function(df,SP500_SALES_GROWTH_YEAR,'SP500_SALES_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_SALES_QUARTER,'SP500_SALES_QUARTER')
df = Merge__Rename_function(df,SP500_REAL_SALES_GROWTH_QUARTER,'SP500_REAL_SALES_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_SALES_GROWTH_QUARTER,'SP500_SALES_GROWTH_QUARTER')
df = Merge__Rename_function(df,SP500_REAL_SALES_GROWTH_YEAR,'SP500_REAL_SALES_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_REAL_EARNINGS_GROWTH_YEAR,'SP500_REAL_EARNINGS_GROWTH_YEAR')
df = Merge__Rename_function(df,SP500_REAL_SALES_YEAR,'SP500_REAL_SALES_YEAR')
df = Merge__Rename_function(df,SP500_REAL_EARNINGS_GROWTH_QUARTER,var_list[27])
df = Merge__Rename_function(df,SP500_EARNINGS_GROWTH_QUARTER,var_list[28])
df = Merge__Rename_function(df,SP500_REAL_SALES_QUARTER,var_list[29])
df = Merge__Rename_function(df,SP500_EARNINGS_MONTH,var_list[30])
df = Merge__Rename_function(df,SP500_BVPS_YEAR,var_list[31])
df = Merge__Rename_function(df,SP500_EARNINGS_YEAR,var_list[32])
df = Merge__Rename_function(df,SP500_EARNINGS_GROWTH_YEAR,var_list[33])
df = Merge__Rename_function(df,SP500_BVPS_QUARTER,var_list[34])
#df = Merge__Rename_function(df,SP500_REAL_PRICE_YEAR,var_list[35])
df.columns
(3553, 10) (3559, 11) (3559, 12) (3559, 13) (3559, 14) (3559, 15) (3559, 16) (3559, 17) (3559, 18) (3559, 19) (3559, 20) (3559, 21) (3559, 22) (3559, 23) (3559, 24) (3559, 25) (3559, 26) (3559, 27) (3559, 28) (3559, 29) (3559, 30) (3559, 31) (3559, 32) (3559, 33) (3559, 34) (3559, 35)
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
'Value_SP500_DIV_YEAR', 'Value_SP500_DIV_GROWTH_YEAR',
'Value_SP500_DIV_GROWTH_QUARTER', 'Value_SP500_PBV_RATIO_QUARTER',
'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_PE_RATIO_YEAR',
'Value_SP500_DIV_YIELD_YEAR', 'Value_SP500_PSR_YEAR',
'Value_SP500_EARNINGS_YIELD_YEAR', 'Value_SP500_PBV_RATIO_YEAR',
'Value_SP500_INFLADJ_YEAR', 'Value_SP500_SALES_YEAR',
'Value_SP500_SALES_GROWTH_YEAR', 'Value_SP500_SALES_QUARTER',
'Value_SP500_REAL_SALES_GROWTH_QUARTER',
'Value_SP500_SALES_GROWTH_QUARTER',
'Value_SP500_REAL_SALES_GROWTH_YEAR',
'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SP500_REAL_SALES_YEAR',
'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_QUARTER',
'Value_SP500_EARNINGS_MONTH', 'Value_SP500_BVPS_YEAR',
'Value_SP500_EARNINGS_YEAR', 'Value_SP500_EARNINGS_GROWTH_YEAR',
'Value_SP500_BVPS_QUARTER'],
dtype='object')
df.head()
| Value_SP500_REAL_PRICE_MONTH | Value_SP500_DIV_YIELD_MONTH | Value_SP500_PE_RATIO_MONTH | Value_SHILLER_PE_RATIO_MONTH | Value_SP500_EARNINGS_YIELD_MONTH | Value_SP500_INFLADJ_MONTH | Value_SP500_PSR_QUARTER | Value_SP500_DIV_MONTH | Value_SP500_DIV_YEAR | Value_SP500_DIV_GROWTH_YEAR | ... | Value_SP500_REAL_EARNINGS_GROWTH_YEAR | Value_SP500_REAL_SALES_YEAR | Value_SP500_REAL_EARNINGS_GROWTH_QUARTER | Value_SP500_EARNINGS_GROWTH_QUARTER | Value_SP500_REAL_SALES_QUARTER | Value_SP500_EARNINGS_MONTH | Value_SP500_BVPS_YEAR | Value_SP500_EARNINGS_YEAR | Value_SP500_EARNINGS_GROWTH_YEAR | Value_SP500_BVPS_QUARTER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 1871-01-01 | 4.44 | NaN | 11.10 | NaN | 9.01 | 89.81 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1871-01-31 | NaN | 5.86 | NaN | NaN | NaN | NaN | NaN | 5.26 | 5.15 | NaN | ... | NaN | NaN | NaN | NaN | NaN | 8.09 | NaN | 7.92 | NaN | NaN |
| 1871-02-01 | 4.50 | NaN | 11.25 | 10.92 | 8.89 | 88.33 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1871-02-28 | NaN | 5.78 | NaN | NaN | NaN | NaN | NaN | 5.10 | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 7.85 | NaN | NaN | NaN | NaN |
| 1871-03-01 | 4.61 | NaN | 11.52 | 11.19 | 8.68 | 89.17 | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 35 columns
#Pandas profiling of df
pandas_profiling.ProfileReport(df)
Dataset info
| Number of variables | 36 |
|---|---|
| Number of observations | 3559 |
| Total Missing (%) | 15.2% |
| Total size in memory | 1001.0 KiB |
| Average record size in memory | 288.0 B |
Variables types
| Numeric | 8 |
|---|---|
| Categorical | 0 |
| Boolean | 0 |
| Date | 1 |
| Text (Unique) | 0 |
| Rejected | 27 |
| Unsupported | 0 |
Warnings
Value_SHILLER_PE_RATIO_MONTH has 1787 / 50.2% missing values MissingValue_SHILLER_PE_RATIO_YEAR is highly correlated with Value_SP500_PBV_RATIO_QUARTER (ρ = 1) RejectedValue_SP500_BVPS_QUARTER is highly correlated with Value_SP500_BVPS_YEAR (ρ = 1) RejectedValue_SP500_BVPS_YEAR is highly correlated with Value_SP500_SALES_QUARTER (ρ = 0.9637) RejectedValue_SP500_DIV_GROWTH_QUARTER is highly correlated with Value_SP500_DIV_GROWTH_YEAR (ρ = 1) RejectedValue_SP500_DIV_GROWTH_YEAR has 3522 / 99.0% missing values MissingValue_SP500_DIV_MONTH has 1786 / 50.2% missing values MissingValue_SP500_DIV_YEAR is highly correlated with Value_SP500_DIV_MONTH (ρ = 0.99996) RejectedValue_SP500_DIV_YIELD_MONTH has 1782 / 50.1% missing values MissingValue_SP500_DIV_YIELD_YEAR is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH (ρ = 1) RejectedValue_SP500_EARNINGS_GROWTH_QUARTER is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER (ρ = 0.99905) RejectedValue_SP500_EARNINGS_GROWTH_YEAR is highly correlated with Value_SP500_EARNINGS_GROWTH_QUARTER (ρ = 1) RejectedValue_SP500_EARNINGS_MONTH is highly correlated with Value_SP500_DIV_YEAR (ρ = 0.94144) RejectedValue_SP500_EARNINGS_YEAR is highly correlated with Value_SP500_EARNINGS_MONTH (ρ = 0.99996) RejectedValue_SP500_EARNINGS_YIELD_MONTH is highly correlated with Value_SP500_DIV_YIELD_MONTH (ρ = 1) RejectedValue_SP500_EARNINGS_YIELD_YEAR is highly correlated with Value_SP500_DIV_YIELD_YEAR (ρ = 1) RejectedValue_SP500_INFLADJ_MONTH is highly correlated with Value_SP500_REAL_PRICE_MONTH (ρ = 0.96721) RejectedValue_SP500_INFLADJ_YEAR is highly correlated with Value_SP500_PBV_RATIO_YEAR (ρ = 1) RejectedValue_SP500_PBV_RATIO_QUARTER is highly correlated with Value_SP500_INFLADJ_MONTH (ρ = 1) RejectedValue_SP500_PBV_RATIO_YEAR is highly correlated with Value_SP500_PE_RATIO_YEAR (ρ = 1) RejectedValue_SP500_PE_RATIO_MONTH has 1786 / 50.2% missing values MissingValue_SP500_PE_RATIO_YEAR is highly correlated with Value_SP500_PBV_RATIO_QUARTER (ρ = 1) RejectedValue_SP500_PSR_QUARTER is highly correlated with Value_SP500_INFLADJ_MONTH (ρ = 1) RejectedValue_SP500_PSR_YEAR is highly correlated with Value_SP500_PE_RATIO_YEAR (ρ = 1) RejectedValue_SP500_REAL_EARNINGS_GROWTH_QUARTER is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR (ρ = 1) RejectedValue_SP500_REAL_EARNINGS_GROWTH_YEAR has 3522 / 99.0% missing values MissingValue_SP500_REAL_PRICE_MONTH has 1788 / 50.2% missing values MissingValue_SP500_REAL_SALES_GROWTH_QUARTER is highly correlated with Value_SP500_SALES_GROWTH_YEAR (ρ = 0.99159) RejectedValue_SP500_REAL_SALES_GROWTH_YEAR is highly correlated with Value_SP500_SALES_GROWTH_QUARTER (ρ = 0.99159) RejectedValue_SP500_REAL_SALES_QUARTER is highly correlated with Value_SP500_REAL_SALES_YEAR (ρ = 0.99714) RejectedValue_SP500_REAL_SALES_YEAR is highly correlated with Value_SP500_SALES_QUARTER (ρ = 0.9333) RejectedValue_SP500_SALES_GROWTH_QUARTER is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER (ρ = 0.99183) RejectedValue_SP500_SALES_GROWTH_YEAR has 3534 / 99.3% missing values MissingValue_SP500_SALES_QUARTER is highly correlated with Value_SP500_SALES_YEAR (ρ = 1) RejectedValue_SP500_SALES_YEAR is highly correlated with Value_SP500_DIV_YEAR (ρ = 0.92834) RejectedDate
Date
| Distinct count | 3559 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Minimum | 1871-01-01 00:00:00 |
|---|---|
| Maximum | 2018-12-31 00:00:00 |
Value_SHILLER_PE_RATIO_MONTH
Numeric
| Distinct count | 1189 |
|---|---|
| Unique (%) | 33.4% |
| Missing (%) | 50.2% |
| Missing (n) | 1787 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 16.57 |
|---|---|
| Minimum | 4.78 |
| Maximum | 44.19 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.78 |
|---|---|
| 5-th percentile | 7.982 |
| Q1 | 11.627 |
| Median | 15.67 |
| Q3 | 20.15 |
| 95-th percentile | 27.901 |
| Maximum | 44.19 |
| Range | 39.41 |
| Interquartile range | 8.5225 |
Descriptive statistics
| Standard deviation | 6.6713 |
|---|---|
| Coef of variation | 0.40262 |
| Kurtosis | 1.99 |
| Mean | 16.57 |
| MAD | 5.1052 |
| Skewness | 1.1235 |
| Sum | 29362 |
| Variance | 44.506 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 11.34 | 6 | 0.2% |
|
| 17.82 | 6 | 0.2% |
|
| 16.6 | 5 | 0.1% |
|
| 12.05 | 5 | 0.1% |
|
| 13.8 | 5 | 0.1% |
|
| 17.05 | 4 | 0.1% |
|
| 18.2 | 4 | 0.1% |
|
| 15.27 | 4 | 0.1% |
|
| 16.16 | 4 | 0.1% |
|
| 15.47 | 4 | 0.1% |
|
| Other values (1178) | 1725 | 48.5% |
|
| (Missing) | 1787 | 50.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.78 | 1 | 0.0% |
|
| 5.02 | 1 | 0.0% |
|
| 5.04 | 1 | 0.0% |
|
| 5.08 | 1 | 0.0% |
|
| 5.12 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 43.22 | 1 | 0.0% |
|
| 43.53 | 1 | 0.0% |
|
| 43.77 | 1 | 0.0% |
|
| 43.83 | 1 | 0.0% |
|
| 44.19 | 1 | 0.0% |
|
Value_SHILLER_PE_RATIO_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_BVPS_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_BVPS_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_BVPS_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis
| Correlation | 0.9637 |
|---|
Value_SP500_DIV_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_DIV_GROWTH_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_DIV_GROWTH_YEAR
Numeric
| Distinct count | 34 |
|---|---|
| Unique (%) | 1.0% |
| Missing (%) | 99.0% |
| Missing (n) | 3522 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 6.5478 |
|---|---|
| Minimum | -21.07 |
| Maximum | 18.25 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -21.07 |
|---|---|
| 5-th percentile | -2.668 |
| Q1 | 3.07 |
| Median | 7.07 |
| Q3 | 11.45 |
| 95-th percentile | 16.26 |
| Maximum | 18.25 |
| Range | 39.32 |
| Interquartile range | 8.38 |
Descriptive statistics
| Standard deviation | 6.9429 |
|---|---|
| Coef of variation | 1.0603 |
| Kurtosis | 5.896 |
| Mean | 6.5478 |
| MAD | 4.8668 |
| Skewness | -1.6172 |
| Sum | 242.27 |
| Variance | 48.204 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 7.99 | 2 | 0.1% |
|
| 5.33 | 2 | 0.1% |
|
| 7.07 | 2 | 0.1% |
|
| 16.26 | 2 | 0.1% |
|
| 9.33 | 1 | 0.0% |
|
| 1.49 | 1 | 0.0% |
|
| 8.16 | 1 | 0.0% |
|
| 12.01 | 1 | 0.0% |
|
| 10.0 | 1 | 0.0% |
|
| 11.99 | 1 | 0.0% |
|
| Other values (23) | 23 | 0.6% |
|
| (Missing) | 3522 | 99.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -21.07 | 1 | 0.0% |
|
| -3.26 | 1 | 0.0% |
|
| -2.52 | 1 | 0.0% |
|
| 0.97 | 1 | 0.0% |
|
| 1.45 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 12.72 | 1 | 0.0% |
|
| 13.38 | 1 | 0.0% |
|
| 14.27 | 1 | 0.0% |
|
| 16.26 | 2 | 0.1% |
|
| 18.25 | 1 | 0.0% |
|
Value_SP500_DIV_MONTH
Numeric
| Distinct count | 1119 |
|---|---|
| Unique (%) | 31.4% |
| Missing (%) | 50.2% |
| Missing (n) | 1786 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.415 |
|---|---|
| Minimum | 5.03 |
| Maximum | 52.26 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.03 |
|---|---|
| 5-th percentile | 6.7 |
| Q1 | 8.61 |
| Median | 12.88 |
| Q3 | 19.6 |
| 95-th percentile | 32.694 |
| Maximum | 52.26 |
| Range | 47.23 |
| Interquartile range | 10.99 |
Descriptive statistics
| Standard deviation | 8.7887 |
|---|---|
| Coef of variation | 0.57013 |
| Kurtosis | 3.0931 |
| Mean | 15.415 |
| MAD | 6.7561 |
| Skewness | 1.5803 |
| Sum | 27331 |
| Variance | 77.241 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 7.19 | 9 | 0.3% |
|
| 7.29 | 8 | 0.2% |
|
| 8.09 | 7 | 0.2% |
|
| 6.81 | 7 | 0.2% |
|
| 7.11 | 6 | 0.2% |
|
| 8.4 | 6 | 0.2% |
|
| 8.95 | 6 | 0.2% |
|
| 7.22 | 6 | 0.2% |
|
| 9.11 | 5 | 0.1% |
|
| 7.71 | 5 | 0.1% |
|
| Other values (1108) | 1708 | 48.0% |
|
| (Missing) | 1786 | 50.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.03 | 1 | 0.0% |
|
| 5.04 | 1 | 0.0% |
|
| 5.1 | 1 | 0.0% |
|
| 5.17 | 2 | 0.1% |
|
| 5.18 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 50.75 | 1 | 0.0% |
|
| 51.0 | 1 | 0.0% |
|
| 51.45 | 1 | 0.0% |
|
| 51.87 | 1 | 0.0% |
|
| 52.26 | 1 | 0.0% |
|
Value_SP500_DIV_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis
| Correlation | 0.99996 |
|---|
Value_SP500_DIV_YIELD_MONTH
Numeric
| Distinct count | 608 |
|---|---|
| Unique (%) | 17.1% |
| Missing (%) | 50.1% |
| Missing (n) | 1782 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.3421 |
|---|---|
| Minimum | 1.11 |
| Maximum | 13.84 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.11 |
|---|---|
| 5-th percentile | 1.74 |
| Q1 | 3.15 |
| Median | 4.28 |
| Q3 | 5.39 |
| 95-th percentile | 7.18 |
| Maximum | 13.84 |
| Range | 12.73 |
| Interquartile range | 2.24 |
Descriptive statistics
| Standard deviation | 1.7025 |
|---|---|
| Coef of variation | 0.39208 |
| Kurtosis | 0.83756 |
| Mean | 4.3421 |
| MAD | 1.3453 |
| Skewness | 0.48864 |
| Sum | 7715.9 |
| Variance | 2.8984 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 4.55 | 10 | 0.3% |
|
| 4.22 | 10 | 0.3% |
|
| 5.18 | 10 | 0.3% |
|
| 5.22 | 10 | 0.3% |
|
| 4.43 | 9 | 0.3% |
|
| 3.53 | 9 | 0.3% |
|
| 1.76 | 9 | 0.3% |
|
| 3.87 | 8 | 0.2% |
|
| 4.69 | 8 | 0.2% |
|
| 2.9 | 8 | 0.2% |
|
| Other values (597) | 1686 | 47.4% |
|
| (Missing) | 1782 | 50.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.11 | 2 | 0.1% |
|
| 1.13 | 1 | 0.0% |
|
| 1.14 | 1 | 0.0% |
|
| 1.15 | 1 | 0.0% |
|
| 1.16 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 10.15 | 1 | 0.0% |
|
| 11.36 | 1 | 0.0% |
|
| 12.46 | 1 | 0.0% |
|
| 12.64 | 1 | 0.0% |
|
| 13.84 | 1 | 0.0% |
|
Value_SP500_DIV_YIELD_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 0.99905 |
|---|
Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_EARNINGS_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_EARNINGS_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis
| Correlation | 0.94144 |
|---|
Value_SP500_EARNINGS_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_EARNINGS_MONTH and should be ignored for analysis
| Correlation | 0.99996 |
|---|
Value_SP500_EARNINGS_YIELD_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_DIV_YIELD_MONTH and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_EARNINGS_YIELD_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_DIV_YIELD_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_INFLADJ_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis
| Correlation | 0.96721 |
|---|
Value_SP500_INFLADJ_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_PBV_RATIO_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_PBV_RATIO_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_PBV_RATIO_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_PE_RATIO_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_PE_RATIO_MONTH
Numeric
| Distinct count | 1117 |
|---|---|
| Unique (%) | 31.4% |
| Missing (%) | 50.2% |
| Missing (n) | 1786 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.725 |
|---|---|
| Minimum | 5.31 |
| Maximum | 123.73 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.31 |
|---|---|
| 5-th percentile | 7.83 |
| Q1 | 11.45 |
| Median | 14.73 |
| Q3 | 18.04 |
| 95-th percentile | 25.798 |
| Maximum | 123.73 |
| Range | 118.42 |
| Interquartile range | 6.59 |
Descriptive statistics
| Standard deviation | 8.4055 |
|---|---|
| Coef of variation | 0.53453 |
| Kurtosis | 67.915 |
| Mean | 15.725 |
| MAD | 4.6369 |
| Skewness | 6.4631 |
| Sum | 27880 |
| Variance | 70.653 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 15.61 | 6 | 0.2% |
|
| 19.0 | 5 | 0.1% |
|
| 13.82 | 5 | 0.1% |
|
| 9.84 | 5 | 0.1% |
|
| 7.97 | 5 | 0.1% |
|
| 11.48 | 5 | 0.1% |
|
| 12.21 | 5 | 0.1% |
|
| 17.48 | 5 | 0.1% |
|
| 17.83 | 5 | 0.1% |
|
| 12.56 | 4 | 0.1% |
|
| Other values (1106) | 1723 | 48.4% |
|
| (Missing) | 1786 | 50.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.31 | 1 | 0.0% |
|
| 5.41 | 1 | 0.0% |
|
| 5.74 | 1 | 0.0% |
|
| 5.81 | 1 | 0.0% |
|
| 5.82 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 101.87 | 1 | 0.0% |
|
| 110.37 | 1 | 0.0% |
|
| 119.85 | 1 | 0.0% |
|
| 123.32 | 1 | 0.0% |
|
| 123.73 | 1 | 0.0% |
|
Value_SP500_PE_RATIO_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_PSR_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_PSR_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_PE_RATIO_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric
| Distinct count | 36 |
|---|---|
| Unique (%) | 1.0% |
| Missing (%) | 99.0% |
| Missing (n) | 3522 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 12.684 |
|---|---|
| Minimum | -79.48 |
| Maximum | 261.66 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -79.48 |
|---|---|
| 5-th percentile | -32.784 |
| Q1 | 0.3 |
| Median | 10.69 |
| Q3 | 14.93 |
| 95-th percentile | 54.266 |
| Maximum | 261.66 |
| Range | 341.14 |
| Interquartile range | 14.63 |
Descriptive statistics
| Standard deviation | 48.913 |
|---|---|
| Coef of variation | 3.8564 |
| Kurtosis | 19.396 |
| Mean | 12.684 |
| MAD | 21.87 |
| Skewness | 3.637 |
| Sum | 469.29 |
| Variance | 2392.5 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 8.86 | 2 | 0.1% |
|
| 7.81 | 2 | 0.1% |
|
| -6.49 | 1 | 0.0% |
|
| 15.9 | 1 | 0.0% |
|
| -11.6 | 1 | 0.0% |
|
| -79.48 | 1 | 0.0% |
|
| 10.69 | 1 | 0.0% |
|
| 11.38 | 1 | 0.0% |
|
| 0.36 | 1 | 0.0% |
|
| -51.84 | 1 | 0.0% |
|
| Other values (25) | 25 | 0.7% |
|
| (Missing) | 3522 | 99.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -79.48 | 1 | 0.0% |
|
| -51.84 | 1 | 0.0% |
|
| -28.02 | 1 | 0.0% |
|
| -21.05 | 1 | 0.0% |
|
| -15.56 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 24.85 | 1 | 0.0% |
|
| 36.07 | 1 | 0.0% |
|
| 49.72 | 1 | 0.0% |
|
| 72.45 | 1 | 0.0% |
|
| 261.66 | 1 | 0.0% |
|
Value_SP500_REAL_PRICE_MONTH
Numeric
| Distinct count | 1401 |
|---|---|
| Unique (%) | 39.4% |
| Missing (%) | 50.2% |
| Missing (n) | 1788 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 262.57 |
|---|---|
| Minimum | 2.73 |
| Maximum | 2789.8 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2.73 |
|---|---|
| 5-th percentile | 4.4 |
| Q1 | 7.745 |
| Median | 16.5 |
| Q3 | 123.65 |
| 95-th percentile | 1420.4 |
| Maximum | 2789.8 |
| Range | 2787.1 |
| Interquartile range | 115.91 |
Descriptive statistics
| Standard deviation | 523.67 |
|---|---|
| Coef of variation | 1.9944 |
| Kurtosis | 5.389 |
| Mean | 262.57 |
| MAD | 359.68 |
| Skewness | 2.4198 |
| Sum | 465010 |
| Variance | 274230 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 4.37 | 7 | 0.2% |
|
| 4.46 | 7 | 0.2% |
|
| 5.3 | 6 | 0.2% |
|
| 7.68 | 6 | 0.2% |
|
| 5.18 | 6 | 0.2% |
|
| 5.32 | 6 | 0.2% |
|
| 4.34 | 5 | 0.1% |
|
| 4.38 | 5 | 0.1% |
|
| 8.12 | 5 | 0.1% |
|
| 4.65 | 5 | 0.1% |
|
| Other values (1390) | 1713 | 48.1% |
|
| (Missing) | 1788 | 50.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2.73 | 1 | 0.0% |
|
| 2.85 | 1 | 0.0% |
|
| 2.94 | 2 | 0.1% |
|
| 3.05 | 1 | 0.0% |
|
| 3.17 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2702.77 | 1 | 0.0% |
|
| 2705.16 | 1 | 0.0% |
|
| 2736.61 | 1 | 0.0% |
|
| 2754.35 | 1 | 0.0% |
|
| 2789.8 | 1 | 0.0% |
|
Value_SP500_REAL_SALES_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis
| Correlation | 0.99159 |
|---|
Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_SALES_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 0.99159 |
|---|
Value_SP500_REAL_SALES_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis
| Correlation | 0.99714 |
|---|
Value_SP500_REAL_SALES_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis
| Correlation | 0.9333 |
|---|
Value_SP500_SALES_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 0.99183 |
|---|
Value_SP500_SALES_GROWTH_YEAR
Numeric
| Distinct count | 22 |
|---|---|
| Unique (%) | 0.6% |
| Missing (%) | 99.3% |
| Missing (n) | 3534 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.2676 |
|---|---|
| Minimum | -12.86 |
| Maximum | 10.93 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -12.86 |
|---|---|
| 5-th percentile | -7.382 |
| Q1 | 2.09 |
| Median | 5.68 |
| Q3 | 7.68 |
| 95-th percentile | 10.576 |
| Maximum | 10.93 |
| Range | 23.79 |
| Interquartile range | 5.59 |
Descriptive statistics
| Standard deviation | 5.775 |
|---|---|
| Coef of variation | 1.3532 |
| Kurtosis | 2.4996 |
| Mean | 4.2676 |
| MAD | 4.2259 |
| Skewness | -1.5198 |
| Sum | 106.69 |
| Variance | 33.351 |
| Memory size | 27.9 KiB |
| Value | Count | Frequency (%) | |
| 2.09 | 2 | 0.1% |
|
| 9.36 | 2 | 0.1% |
|
| 7.68 | 2 | 0.1% |
|
| 7.03 | 2 | 0.1% |
|
| 1.7 | 1 | 0.0% |
|
| 5.68 | 1 | 0.0% |
|
| 5.37 | 1 | 0.0% |
|
| 10.93 | 1 | 0.0% |
|
| 4.16 | 1 | 0.0% |
|
| -1.18 | 1 | 0.0% |
|
| Other values (11) | 11 | 0.3% |
|
| (Missing) | 3534 | 99.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -12.86 | 1 | 0.0% |
|
| -8.45 | 1 | 0.0% |
|
| -3.11 | 1 | 0.0% |
|
| -1.18 | 1 | 0.0% |
|
| 1.7 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 8.94 | 1 | 0.0% |
|
| 9.03 | 1 | 0.0% |
|
| 9.36 | 2 | 0.1% |
|
| 10.88 | 1 | 0.0% |
|
| 10.93 | 1 | 0.0% |
|
Value_SP500_SALES_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_SALES_YEAR and should be ignored for analysis
| Correlation | 1 |
|---|
Value_SP500_SALES_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis
| Correlation | 0.92834 |
|---|
| Value_SP500_REAL_PRICE_MONTH | Value_SP500_DIV_YIELD_MONTH | Value_SP500_PE_RATIO_MONTH | Value_SHILLER_PE_RATIO_MONTH | Value_SP500_EARNINGS_YIELD_MONTH | Value_SP500_INFLADJ_MONTH | Value_SP500_PSR_QUARTER | Value_SP500_DIV_MONTH | Value_SP500_DIV_YEAR | Value_SP500_DIV_GROWTH_YEAR | Value_SP500_DIV_GROWTH_QUARTER | Value_SP500_PBV_RATIO_QUARTER | Value_SHILLER_PE_RATIO_YEAR | Value_SP500_PE_RATIO_YEAR | Value_SP500_DIV_YIELD_YEAR | Value_SP500_PSR_YEAR | Value_SP500_EARNINGS_YIELD_YEAR | Value_SP500_PBV_RATIO_YEAR | Value_SP500_INFLADJ_YEAR | Value_SP500_SALES_YEAR | Value_SP500_SALES_GROWTH_YEAR | Value_SP500_SALES_QUARTER | Value_SP500_REAL_SALES_GROWTH_QUARTER | Value_SP500_SALES_GROWTH_QUARTER | Value_SP500_REAL_SALES_GROWTH_YEAR | Value_SP500_REAL_EARNINGS_GROWTH_YEAR | Value_SP500_REAL_SALES_YEAR | Value_SP500_REAL_EARNINGS_GROWTH_QUARTER | Value_SP500_EARNINGS_GROWTH_QUARTER | Value_SP500_REAL_SALES_QUARTER | Value_SP500_EARNINGS_MONTH | Value_SP500_BVPS_YEAR | Value_SP500_EARNINGS_YEAR | Value_SP500_EARNINGS_GROWTH_YEAR | Value_SP500_BVPS_QUARTER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||||||||||||||||
| 1871-01-01 | 4.44 | NaN | 11.10 | NaN | 9.01 | 89.81 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 11.1 | NaN | NaN | 9.01 | NaN | 89.81 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1871-01-31 | NaN | 5.86 | NaN | NaN | NaN | NaN | NaN | 5.26 | 5.15 | NaN | NaN | NaN | NaN | NaN | 5.86 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 8.09 | NaN | 7.92 | NaN | NaN |
| 1871-02-01 | 4.50 | NaN | 11.25 | 10.92 | 8.89 | 88.33 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1871-02-28 | NaN | 5.78 | NaN | NaN | NaN | NaN | NaN | 5.10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 7.85 | NaN | NaN | NaN | NaN |
| 1871-03-01 | 4.61 | NaN | 11.52 | 11.19 | 8.68 | 89.17 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
def Column_missing_values(df):
'''
Function to find missing values or NaN values per column and print percentage of missing values per column
in bar chart.
IN - any pandas dataframe
'''
#Percentage of missing values per column
missing_columns = df.isnull().sum()
if missing_columns.values.any() != 0:
missing_columns = (missing_columns[missing_columns>0]/df.shape[0]) * 100
missing_columns.sort_values(inplace=True)
missing_columns.plot.bar(title = 'Column wise percentage missing values', figsize=(8,4))
else:
print('No missing values in provided dataframe')
return
Column_missing_values(df)
#Correlation plot for numerical encoded version of API data in df.
sns.heatmap(df.corr(),annot=None,fmt='.2f',square=False)
<matplotlib.axes._subplots.AxesSubplot at 0x23b2a9a5ba8>
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(df)
plt.show()
#Drop last rows with NaN values for Value_SP500_REAL_PRICE_MONTH because these were outlier with no price values.
df.drop(df.tail(11).index,axis=0,inplace=True)
Original dataset has many missing or NaN values which is noise to end results. General methods like replacing with mean values can not be applied because it would add bias to the timeseries dataset.
import impyute.imputation.cs
import impyute.imputation.ts
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer, MissingIndicator
from statsmodels.tsa.arima_model import ARIMA
from sklearn.model_selection import cross_val_score
#Create an imputation object
imputer_most_frequent= SimpleImputer(missing_values=np.nan,strategy ='most_frequent')
#Inject imputed values in the dataset.
df_imputed = pd.DataFrame(imputer_most_frequent.fit_transform(df))
df_imputed.columns = df.columns
df_imputed.index = df.index
pandas_profiling.ProfileReport(df_imputed)
Dataset info
| Number of variables | 36 |
|---|---|
| Number of observations | 3548 |
| Total Missing (%) | 0.0% |
| Total size in memory | 998.0 KiB |
| Average record size in memory | 288.0 B |
Variables types
| Numeric | 25 |
|---|---|
| Categorical | 0 |
| Boolean | 0 |
| Date | 1 |
| Text (Unique) | 0 |
| Rejected | 10 |
| Unsupported | 0 |
Warnings
Value_SP500_BVPS_QUARTER is highly correlated with Value_SP500_REAL_SALES_QUARTER (ρ = 0.951) RejectedValue_SP500_BVPS_YEAR is highly correlated with Value_SP500_REAL_SALES_YEAR (ρ = 0.9145) RejectedValue_SP500_EARNINGS_GROWTH_QUARTER is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER (ρ = 0.99834) RejectedValue_SP500_EARNINGS_GROWTH_YEAR is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR (ρ = 0.9994) RejectedValue_SP500_EARNINGS_MONTH is highly correlated with Value_SP500_DIV_MONTH (ρ = 0.94595) RejectedValue_SP500_EARNINGS_YEAR is highly correlated with Value_SP500_DIV_YEAR (ρ = 0.94718) RejectedValue_SP500_INFLADJ_MONTH is highly correlated with Value_SP500_REAL_PRICE_MONTH (ρ = 0.95138) RejectedValue_SP500_PBV_RATIO_YEAR is highly skewed (γ1 = 23.544) SkewedValue_SP500_PSR_YEAR is highly skewed (γ1 = 20.076) SkewedValue_SP500_REAL_EARNINGS_GROWTH_QUARTER is highly skewed (γ1 = 32.842) SkewedValue_SP500_REAL_SALES_GROWTH_YEAR is highly correlated with Value_SP500_SALES_GROWTH_YEAR (ρ = 0.99905) RejectedValue_SP500_REAL_SALES_QUARTER is highly correlated with Value_SP500_SALES_QUARTER (ρ = 0.98094) RejectedValue_SP500_REAL_SALES_YEAR is highly correlated with Value_SP500_REAL_SALES_GROWTH_YEAR (ρ = 0.91447) RejectedDate
Date
| Distinct count | 3548 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Minimum | 1871-01-01 00:00:00 |
|---|---|
| Maximum | 2018-12-31 00:00:00 |
Value_SHILLER_PE_RATIO_MONTH
Numeric
| Distinct count | 1188 |
|---|---|
| Unique (%) | 33.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 13.952 |
|---|---|
| Minimum | 4.78 |
| Maximum | 44.19 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.78 |
|---|---|
| 5-th percentile | 9.26 |
| Q1 | 11.34 |
| Median | 11.34 |
| Q3 | 15.66 |
| 95-th percentile | 25.41 |
| Maximum | 44.19 |
| Range | 39.41 |
| Interquartile range | 4.32 |
Descriptive statistics
| Standard deviation | 5.3909 |
|---|---|
| Coef of variation | 0.38639 |
| Kurtosis | 6.0424 |
| Mean | 13.952 |
| MAD | 3.9298 |
| Skewness | 2.1774 |
| Sum | 49501 |
| Variance | 29.061 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 11.34 | 1782 | 50.2% |
|
| 17.82 | 6 | 0.2% |
|
| 13.8 | 5 | 0.1% |
|
| 16.6 | 5 | 0.1% |
|
| 12.05 | 5 | 0.1% |
|
| 16.83 | 4 | 0.1% |
|
| 10.0 | 4 | 0.1% |
|
| 10.91 | 4 | 0.1% |
|
| 15.47 | 4 | 0.1% |
|
| 18.96 | 4 | 0.1% |
|
| Other values (1178) | 1725 | 48.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.78 | 1 | 0.0% |
|
| 5.02 | 1 | 0.0% |
|
| 5.04 | 1 | 0.0% |
|
| 5.08 | 1 | 0.0% |
|
| 5.12 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 43.22 | 1 | 0.0% |
|
| 43.53 | 1 | 0.0% |
|
| 43.77 | 1 | 0.0% |
|
| 43.83 | 1 | 0.0% |
|
| 44.19 | 1 | 0.0% |
|
Value_SHILLER_PE_RATIO_YEAR
Numeric
| Distinct count | 141 |
|---|---|
| Unique (%) | 4.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 12.108 |
|---|---|
| Minimum | 5.12 |
| Maximum | 43.77 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.12 |
|---|---|
| 5-th percentile | 11.9 |
| Q1 | 11.9 |
| Median | 11.9 |
| Q3 | 11.9 |
| 95-th percentile | 11.9 |
| Maximum | 43.77 |
| Range | 38.65 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 1.7274 |
|---|---|
| Coef of variation | 0.14267 |
| Kurtosis | 112.09 |
| Mean | 12.108 |
| MAD | 0.45349 |
| Skewness | 9.3367 |
| Sum | 42958 |
| Variance | 2.9838 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 11.9 | 3402 | 95.9% |
|
| 17.22 | 2 | 0.1% |
|
| 9.26 | 2 | 0.1% |
|
| 17.09 | 2 | 0.1% |
|
| 18.47 | 2 | 0.1% |
|
| 20.32 | 2 | 0.1% |
|
| 13.9 | 2 | 0.1% |
|
| 27.21 | 1 | 0.0% |
|
| 22.9 | 1 | 0.0% |
|
| 20.97 | 1 | 0.0% |
|
| Other values (131) | 131 | 3.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.12 | 1 | 0.0% |
|
| 5.99 | 1 | 0.0% |
|
| 6.1 | 1 | 0.0% |
|
| 6.29 | 1 | 0.0% |
|
| 6.64 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 33.31 | 1 | 0.0% |
|
| 33.36 | 1 | 0.0% |
|
| 36.98 | 1 | 0.0% |
|
| 40.57 | 1 | 0.0% |
|
| 43.77 | 1 | 0.0% |
|
Value_SP500_BVPS_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_SALES_QUARTER and should be ignored for analysis
| Correlation | 0.951 |
|---|
Value_SP500_BVPS_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis
| Correlation | 0.9145 |
|---|
Value_SP500_DIV_GROWTH_QUARTER
Numeric
| Distinct count | 113 |
|---|---|
| Unique (%) | 3.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 11.696 |
|---|---|
| Minimum | -21.07 |
| Maximum | 18.25 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -21.07 |
|---|---|
| 5-th percentile | 11.89 |
| Q1 | 11.89 |
| Median | 11.89 |
| Q3 | 11.89 |
| 95-th percentile | 11.89 |
| Maximum | 18.25 |
| Range | 39.32 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 1.6985 |
|---|---|
| Coef of variation | 0.14523 |
| Kurtosis | 144.27 |
| Mean | 11.696 |
| MAD | 0.4133 |
| Skewness | -10.499 |
| Sum | 41496 |
| Variance | 2.8849 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 11.89 | 3434 | 96.8% |
|
| 12.65 | 2 | 0.1% |
|
| 12.49 | 2 | 0.1% |
|
| 10.46 | 1 | 0.0% |
|
| 18.25 | 1 | 0.0% |
|
| 5.05 | 1 | 0.0% |
|
| 1.55 | 1 | 0.0% |
|
| -4.71 | 1 | 0.0% |
|
| 1.26 | 1 | 0.0% |
|
| 8.02 | 1 | 0.0% |
|
| Other values (103) | 103 | 2.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -21.07 | 1 | 0.0% |
|
| -19.63 | 1 | 0.0% |
|
| -17.17 | 1 | 0.0% |
|
| -13.9 | 1 | 0.0% |
|
| -10.86 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 16.74 | 1 | 0.0% |
|
| 17.4 | 1 | 0.0% |
|
| 17.47 | 1 | 0.0% |
|
| 17.51 | 1 | 0.0% |
|
| 18.25 | 1 | 0.0% |
|
Value_SP500_DIV_GROWTH_YEAR
Numeric
| Distinct count | 32 |
|---|---|
| Unique (%) | 0.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -20.824 |
|---|---|
| Minimum | -21.07 |
| Maximum | 18.25 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -21.07 |
|---|---|
| 5-th percentile | -21.07 |
| Q1 | -21.07 |
| Median | -21.07 |
| Q3 | -21.07 |
| 95-th percentile | -21.07 |
| Maximum | 18.25 |
| Range | 39.32 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 2.6645 |
|---|---|
| Coef of variation | -0.12795 |
| Kurtosis | 125.17 |
| Mean | -20.824 |
| MAD | 0.48728 |
| Skewness | 11.107 |
| Sum | -73884 |
| Variance | 7.0994 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -21.07 | 3517 | 99.1% |
|
| 6.99 | 1 | 0.0% |
|
| 3.07 | 1 | 0.0% |
|
| 10.0 | 1 | 0.0% |
|
| 1.49 | 1 | 0.0% |
|
| 9.33 | 1 | 0.0% |
|
| 12.01 | 1 | 0.0% |
|
| 11.99 | 1 | 0.0% |
|
| 0.97 | 1 | 0.0% |
|
| 2.12 | 1 | 0.0% |
|
| Other values (22) | 22 | 0.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -21.07 | 3517 | 99.1% |
|
| -3.26 | 1 | 0.0% |
|
| -2.52 | 1 | 0.0% |
|
| 0.97 | 1 | 0.0% |
|
| 1.45 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 12.72 | 1 | 0.0% |
|
| 13.38 | 1 | 0.0% |
|
| 14.27 | 1 | 0.0% |
|
| 16.26 | 1 | 0.0% |
|
| 18.25 | 1 | 0.0% |
|
Value_SP500_DIV_MONTH
Numeric
| Distinct count | 1118 |
|---|---|
| Unique (%) | 31.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 11.3 |
|---|---|
| Minimum | 5.03 |
| Maximum | 52.26 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.03 |
|---|---|
| 5-th percentile | 7.11 |
| Q1 | 7.19 |
| Median | 7.19 |
| Q3 | 12.843 |
| 95-th percentile | 24.97 |
| Maximum | 52.26 |
| Range | 47.23 |
| Interquartile range | 5.6525 |
Descriptive statistics
| Standard deviation | 7.4502 |
|---|---|
| Coef of variation | 0.6593 |
| Kurtosis | 7.1196 |
| Mean | 11.3 |
| MAD | 5.3875 |
| Skewness | 2.448 |
| Sum | 40093 |
| Variance | 55.506 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 7.19 | 1784 | 50.3% |
|
| 7.29 | 8 | 0.2% |
|
| 6.81 | 7 | 0.2% |
|
| 8.09 | 7 | 0.2% |
|
| 8.4 | 6 | 0.2% |
|
| 8.95 | 6 | 0.2% |
|
| 7.22 | 6 | 0.2% |
|
| 7.11 | 6 | 0.2% |
|
| 6.94 | 5 | 0.1% |
|
| 7.05 | 5 | 0.1% |
|
| Other values (1108) | 1708 | 48.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.03 | 1 | 0.0% |
|
| 5.04 | 1 | 0.0% |
|
| 5.1 | 1 | 0.0% |
|
| 5.17 | 2 | 0.1% |
|
| 5.18 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 50.75 | 1 | 0.0% |
|
| 51.0 | 1 | 0.0% |
|
| 51.45 | 1 | 0.0% |
|
| 51.87 | 1 | 0.0% |
|
| 52.26 | 1 | 0.0% |
|
Value_SP500_DIV_YEAR
Numeric
| Distinct count | 146 |
|---|---|
| Unique (%) | 4.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 7.1088 |
|---|---|
| Minimum | 5.04 |
| Maximum | 52.26 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.04 |
|---|---|
| 5-th percentile | 6.7 |
| Q1 | 6.7 |
| Median | 6.7 |
| Q3 | 6.7 |
| 95-th percentile | 6.7 |
| Maximum | 52.26 |
| Range | 47.22 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 2.8765 |
|---|---|
| Coef of variation | 0.40463 |
| Kurtosis | 118.18 |
| Mean | 7.1088 |
| MAD | 0.79031 |
| Skewness | 9.9101 |
| Sum | 25222 |
| Variance | 8.2741 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 6.7 | 3398 | 95.8% |
|
| 7.11 | 2 | 0.1% |
|
| 6.81 | 2 | 0.1% |
|
| 7.61 | 2 | 0.1% |
|
| 15.89 | 2 | 0.1% |
|
| 12.17 | 2 | 0.1% |
|
| 24.21 | 1 | 0.0% |
|
| 9.18 | 1 | 0.0% |
|
| 8.07 | 1 | 0.0% |
|
| 16.97 | 1 | 0.0% |
|
| Other values (136) | 136 | 3.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.04 | 1 | 0.0% |
|
| 5.15 | 1 | 0.0% |
|
| 5.18 | 1 | 0.0% |
|
| 5.2 | 1 | 0.0% |
|
| 5.55 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 48.17 | 1 | 0.0% |
|
| 50.0 | 1 | 0.0% |
|
| 50.03 | 1 | 0.0% |
|
| 51.02 | 1 | 0.0% |
|
| 52.26 | 1 | 0.0% |
|
Value_SP500_DIV_YIELD_MONTH
Numeric
| Distinct count | 607 |
|---|---|
| Unique (%) | 17.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.2812 |
|---|---|
| Minimum | 1.11 |
| Maximum | 13.84 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.11 |
|---|---|
| 5-th percentile | 1.96 |
| Q1 | 4.22 |
| Median | 4.22 |
| Q3 | 4.28 |
| 95-th percentile | 6.57 |
| Maximum | 13.84 |
| Range | 12.73 |
| Interquartile range | 0.06 |
Descriptive statistics
| Standard deviation | 1.2062 |
|---|---|
| Coef of variation | 0.28175 |
| Kurtosis | 4.7742 |
| Mean | 4.2812 |
| MAD | 0.70379 |
| Skewness | 0.83882 |
| Sum | 15190 |
| Variance | 1.455 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 4.22 | 1781 | 50.2% |
|
| 5.22 | 10 | 0.3% |
|
| 4.55 | 10 | 0.3% |
|
| 5.18 | 10 | 0.3% |
|
| 4.43 | 9 | 0.3% |
|
| 3.53 | 9 | 0.3% |
|
| 1.76 | 9 | 0.3% |
|
| 4.17 | 8 | 0.2% |
|
| 2.93 | 8 | 0.2% |
|
| 4.1 | 8 | 0.2% |
|
| Other values (597) | 1686 | 47.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.11 | 2 | 0.1% |
|
| 1.13 | 1 | 0.0% |
|
| 1.14 | 1 | 0.0% |
|
| 1.15 | 1 | 0.0% |
|
| 1.16 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 10.15 | 1 | 0.0% |
|
| 11.36 | 1 | 0.0% |
|
| 12.46 | 1 | 0.0% |
|
| 12.64 | 1 | 0.0% |
|
| 13.84 | 1 | 0.0% |
|
Value_SP500_DIV_YIELD_YEAR
Numeric
| Distinct count | 137 |
|---|---|
| Unique (%) | 3.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.0815 |
|---|---|
| Minimum | 1.17 |
| Maximum | 10.15 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.17 |
|---|---|
| 5-th percentile | 4.07 |
| Q1 | 4.07 |
| Median | 4.07 |
| Q3 | 4.07 |
| 95-th percentile | 4.07 |
| Maximum | 10.15 |
| Range | 8.98 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.36379 |
|---|---|
| Coef of variation | 0.089132 |
| Kurtosis | 79.461 |
| Mean | 4.0815 |
| MAD | 0.069995 |
| Skewness | 4.2121 |
| Sum | 14481 |
| Variance | 0.13234 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 4.07 | 3400 | 95.8% |
|
| 6.2 | 2 | 0.1% |
|
| 5.41 | 2 | 0.1% |
|
| 4.4 | 2 | 0.1% |
|
| 3.49 | 2 | 0.1% |
|
| 3.53 | 2 | 0.1% |
|
| 3.81 | 2 | 0.1% |
|
| 1.76 | 2 | 0.1% |
|
| 1.61 | 2 | 0.1% |
|
| 5.71 | 2 | 0.1% |
|
| Other values (127) | 130 | 3.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.17 | 1 | 0.0% |
|
| 1.22 | 1 | 0.0% |
|
| 1.36 | 1 | 0.0% |
|
| 1.37 | 1 | 0.0% |
|
| 1.61 | 2 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 7.49 | 1 | 0.0% |
|
| 8.11 | 1 | 0.0% |
|
| 8.38 | 1 | 0.0% |
|
| 9.72 | 1 | 0.0% |
|
| 10.15 | 1 | 0.0% |
|
Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 0.99834 |
|---|
Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis
| Correlation | 0.9994 |
|---|
Value_SP500_EARNINGS_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis
| Correlation | 0.94595 |
|---|
Value_SP500_EARNINGS_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis
| Correlation | 0.94718 |
|---|
Value_SP500_EARNINGS_YIELD_MONTH
Numeric
| Distinct count | 792 |
|---|---|
| Unique (%) | 22.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 6.4929 |
|---|---|
| Minimum | 0.81 |
| Maximum | 18.82 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.81 |
|---|---|
| 5-th percentile | 4.45 |
| Q1 | 5.62 |
| Median | 5.62 |
| Q3 | 6.79 |
| 95-th percentile | 10.95 |
| Maximum | 18.82 |
| Range | 18.01 |
| Interquartile range | 1.17 |
Descriptive statistics
| Standard deviation | 2.1019 |
|---|---|
| Coef of variation | 0.32372 |
| Kurtosis | 5.3453 |
| Mean | 6.4929 |
| MAD | 1.4629 |
| Skewness | 2.0606 |
| Sum | 23037 |
| Variance | 4.418 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 5.62 | 1786 | 50.3% |
|
| 5.72 | 10 | 0.3% |
|
| 5.26 | 10 | 0.3% |
|
| 5.3 | 9 | 0.3% |
|
| 5.29 | 9 | 0.3% |
|
| 5.61 | 8 | 0.2% |
|
| 7.24 | 8 | 0.2% |
|
| 5.53 | 8 | 0.2% |
|
| 5.55 | 8 | 0.2% |
|
| 5.54 | 8 | 0.2% |
|
| Other values (782) | 1684 | 47.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.81 | 2 | 0.1% |
|
| 0.83 | 1 | 0.0% |
|
| 0.91 | 1 | 0.0% |
|
| 0.98 | 1 | 0.0% |
|
| 1.08 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 17.18 | 1 | 0.0% |
|
| 17.21 | 1 | 0.0% |
|
| 17.42 | 1 | 0.0% |
|
| 18.48 | 1 | 0.0% |
|
| 18.82 | 1 | 0.0% |
|
Value_SP500_EARNINGS_YIELD_YEAR
Numeric
| Distinct count | 137 |
|---|---|
| Unique (%) | 3.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 5.4126 |
|---|---|
| Minimum | 1.41 |
| Maximum | 17.42 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.41 |
|---|---|
| 5-th percentile | 5.33 |
| Q1 | 5.33 |
| Median | 5.33 |
| Q3 | 5.33 |
| 95-th percentile | 5.33 |
| Maximum | 17.42 |
| Range | 16.01 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.67946 |
|---|---|
| Coef of variation | 0.12553 |
| Kurtosis | 105.43 |
| Mean | 5.4126 |
| MAD | 0.17808 |
| Skewness | 9.0727 |
| Sum | 19204 |
| Variance | 0.46167 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 5.33 | 3401 | 95.9% |
|
| 5.53 | 3 | 0.1% |
|
| 8.46 | 3 | 0.1% |
|
| 5.55 | 2 | 0.1% |
|
| 6.57 | 2 | 0.1% |
|
| 7.42 | 2 | 0.1% |
|
| 4.44 | 2 | 0.1% |
|
| 7.35 | 2 | 0.1% |
|
| 11.08 | 2 | 0.1% |
|
| 9.87 | 2 | 0.1% |
|
| Other values (127) | 127 | 3.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.41 | 1 | 0.0% |
|
| 2.17 | 1 | 0.0% |
|
| 3.04 | 1 | 0.0% |
|
| 3.18 | 1 | 0.0% |
|
| 3.44 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 13.53 | 1 | 0.0% |
|
| 13.84 | 1 | 0.0% |
|
| 15.1 | 1 | 0.0% |
|
| 15.77 | 1 | 0.0% |
|
| 17.42 | 1 | 0.0% |
|
Value_SP500_INFLADJ_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis
| Correlation | 0.95138 |
|---|
Value_SP500_INFLADJ_YEAR
Numeric
| Distinct count | 150 |
|---|---|
| Unique (%) | 4.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 101.53 |
|---|---|
| Minimum | 81.79 |
| Maximum | 2914.2 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 81.79 |
|---|---|
| 5-th percentile | 81.79 |
| Q1 | 81.79 |
| Median | 81.79 |
| Q3 | 81.79 |
| 95-th percentile | 81.79 |
| Maximum | 2914.2 |
| Range | 2832.4 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 157.9 |
|---|---|
| Coef of variation | 1.5552 |
| Kurtosis | 149.94 |
| Mean | 101.53 |
| MAD | 37.848 |
| Skewness | 11.449 |
| Sum | 360230 |
| Variance | 24932 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 81.79 | 3399 | 95.8% |
|
| 178.12 | 1 | 0.0% |
|
| 103.6 | 1 | 0.0% |
|
| 739.63 | 1 | 0.0% |
|
| 609.48 | 1 | 0.0% |
|
| 188.58 | 1 | 0.0% |
|
| 178.98 | 1 | 0.0% |
|
| 176.02 | 1 | 0.0% |
|
| 2041.06 | 1 | 0.0% |
|
| 161.59 | 1 | 0.0% |
|
| Other values (140) | 140 | 3.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 81.79 | 3399 | 95.8% |
|
| 88.75 | 1 | 0.0% |
|
| 89.81 | 1 | 0.0% |
|
| 94.32 | 1 | 0.0% |
|
| 94.95 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2187.26 | 1 | 0.0% |
|
| 2361.3 | 1 | 0.0% |
|
| 2758.77 | 1 | 0.0% |
|
| 2836.75 | 1 | 0.0% |
|
| 2914.22 | 1 | 0.0% |
|
Value_SP500_PBV_RATIO_QUARTER
Numeric
| Distinct count | 61 |
|---|---|
| Unique (%) | 1.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2.5849 |
|---|---|
| Minimum | 1.78 |
| Maximum | 5.06 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.78 |
|---|---|
| 5-th percentile | 2.58 |
| Q1 | 2.58 |
| Median | 2.58 |
| Q3 | 2.58 |
| 95-th percentile | 2.58 |
| Maximum | 5.06 |
| Range | 3.28 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.10134 |
|---|---|
| Coef of variation | 0.039203 |
| Kurtosis | 309.88 |
| Mean | 2.5849 |
| MAD | 0.015266 |
| Skewness | 14.136 |
| Sum | 9171.1 |
| Variance | 0.010269 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 2.58 | 3473 | 97.9% |
|
| 2.91 | 3 | 0.1% |
|
| 2.76 | 3 | 0.1% |
|
| 2.77 | 3 | 0.1% |
|
| 2.43 | 2 | 0.1% |
|
| 3.03 | 2 | 0.1% |
|
| 2.83 | 2 | 0.1% |
|
| 2.67 | 2 | 0.1% |
|
| 2.78 | 2 | 0.1% |
|
| 2.74 | 2 | 0.1% |
|
| Other values (51) | 54 | 1.5% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.78 | 1 | 0.0% |
|
| 1.85 | 1 | 0.0% |
|
| 1.89 | 1 | 0.0% |
|
| 1.9 | 1 | 0.0% |
|
| 2.0 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 4.05 | 1 | 0.0% |
|
| 4.49 | 1 | 0.0% |
|
| 4.65 | 1 | 0.0% |
|
| 5.05 | 1 | 0.0% |
|
| 5.06 | 1 | 0.0% |
|
Value_SP500_PBV_RATIO_YEAR
Numeric
| Distinct count | 19 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 2.1744 |
|---|---|
| Minimum | 2 |
| Maximum | 5.05 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2 |
|---|---|
| 5-th percentile | 2.17 |
| Q1 | 2.17 |
| Median | 2.17 |
| Q3 | 2.17 |
| 95-th percentile | 2.17 |
| Maximum | 5.05 |
| Range | 3.05 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.078208 |
|---|---|
| Coef of variation | 0.035968 |
| Kurtosis | 682.74 |
| Mean | 2.1744 |
| MAD | 0.0089232 |
| Skewness | 23.544 |
| Sum | 7714.7 |
| Variance | 0.0061165 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 2.17 | 3529 | 99.5% |
|
| 2.76 | 2 | 0.1% |
|
| 2.14 | 1 | 0.0% |
|
| 2.91 | 1 | 0.0% |
|
| 3.5 | 1 | 0.0% |
|
| 2.73 | 1 | 0.0% |
|
| 2.77 | 1 | 0.0% |
|
| 2.81 | 1 | 0.0% |
|
| 2.58 | 1 | 0.0% |
|
| 3.39 | 1 | 0.0% |
|
| Other values (9) | 9 | 0.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2.0 | 1 | 0.0% |
|
| 2.05 | 1 | 0.0% |
|
| 2.14 | 1 | 0.0% |
|
| 2.17 | 3529 | 99.5% |
|
| 2.58 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 3.3 | 1 | 0.0% |
|
| 3.39 | 1 | 0.0% |
|
| 3.5 | 1 | 0.0% |
|
| 4.05 | 1 | 0.0% |
|
| 5.05 | 1 | 0.0% |
|
Value_SP500_PE_RATIO_MONTH
Numeric
| Distinct count | 1116 |
|---|---|
| Unique (%) | 31.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.667 |
|---|---|
| Minimum | 5.31 |
| Maximum | 123.73 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.31 |
|---|---|
| 5-th percentile | 9.13 |
| Q1 | 14.73 |
| Median | 15.61 |
| Q3 | 15.61 |
| 95-th percentile | 22.479 |
| Maximum | 123.73 |
| Range | 118.42 |
| Interquartile range | 0.88 |
Descriptive statistics
| Standard deviation | 5.9414 |
|---|---|
| Coef of variation | 0.37922 |
| Kurtosis | 139.05 |
| Mean | 15.667 |
| MAD | 2.3416 |
| Skewness | 9.1666 |
| Sum | 55588 |
| Variance | 35.3 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 15.61 | 1781 | 50.2% |
|
| 11.48 | 5 | 0.1% |
|
| 12.21 | 5 | 0.1% |
|
| 17.48 | 5 | 0.1% |
|
| 9.84 | 5 | 0.1% |
|
| 19.0 | 5 | 0.1% |
|
| 17.83 | 5 | 0.1% |
|
| 7.97 | 5 | 0.1% |
|
| 13.82 | 5 | 0.1% |
|
| 9.13 | 4 | 0.1% |
|
| Other values (1106) | 1723 | 48.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.31 | 1 | 0.0% |
|
| 5.41 | 1 | 0.0% |
|
| 5.74 | 1 | 0.0% |
|
| 5.81 | 1 | 0.0% |
|
| 5.82 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 101.87 | 1 | 0.0% |
|
| 110.37 | 1 | 0.0% |
|
| 119.85 | 1 | 0.0% |
|
| 123.32 | 1 | 0.0% |
|
| 123.73 | 1 | 0.0% |
|
Value_SP500_PE_RATIO_YEAR
Numeric
| Distinct count | 142 |
|---|---|
| Unique (%) | 4.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 11.989 |
|---|---|
| Minimum | 5.74 |
| Maximum | 70.91 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.74 |
|---|---|
| 5-th percentile | 11.82 |
| Q1 | 11.82 |
| Median | 11.82 |
| Q3 | 11.82 |
| 95-th percentile | 11.82 |
| Maximum | 70.91 |
| Range | 65.17 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 1.6921 |
|---|---|
| Coef of variation | 0.14114 |
| Kurtosis | 487.59 |
| Mean | 11.989 |
| MAD | 0.37918 |
| Skewness | 17.568 |
| Sum | 42537 |
| Variance | 2.8634 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 11.82 | 3401 | 95.9% |
|
| 13.48 | 2 | 0.1% |
|
| 18.01 | 2 | 0.1% |
|
| 7.97 | 2 | 0.1% |
|
| 9.02 | 2 | 0.1% |
|
| 10.13 | 2 | 0.1% |
|
| 18.77 | 2 | 0.1% |
|
| 17.81 | 1 | 0.0% |
|
| 19.33 | 1 | 0.0% |
|
| 19.99 | 1 | 0.0% |
|
| Other values (132) | 132 | 3.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.74 | 1 | 0.0% |
|
| 6.34 | 1 | 0.0% |
|
| 6.62 | 1 | 0.0% |
|
| 7.22 | 1 | 0.0% |
|
| 7.39 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 29.04 | 1 | 0.0% |
|
| 31.43 | 1 | 0.0% |
|
| 32.92 | 1 | 0.0% |
|
| 46.17 | 1 | 0.0% |
|
| 70.91 | 1 | 0.0% |
|
Value_SP500_PSR_QUARTER
Numeric
| Distinct count | 55 |
|---|---|
| Unique (%) | 1.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.432 |
|---|---|
| Minimum | 0.8 |
| Maximum | 2.31 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.8 |
|---|---|
| 5-th percentile | 1.43 |
| Q1 | 1.43 |
| Median | 1.43 |
| Q3 | 1.43 |
| 95-th percentile | 1.43 |
| Maximum | 2.31 |
| Range | 1.51 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.048445 |
|---|---|
| Coef of variation | 0.03383 |
| Kurtosis | 154.51 |
| Mean | 1.432 |
| MAD | 0.0071731 |
| Skewness | 8.0394 |
| Sum | 5080.8 |
| Variance | 0.002347 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 1.43 | 3479 | 98.1% |
|
| 1.44 | 3 | 0.1% |
|
| 1.54 | 3 | 0.1% |
|
| 1.31 | 3 | 0.1% |
|
| 1.19 | 2 | 0.1% |
|
| 1.77 | 2 | 0.1% |
|
| 1.52 | 2 | 0.1% |
|
| 1.27 | 2 | 0.1% |
|
| 2.1 | 2 | 0.1% |
|
| 1.66 | 2 | 0.1% |
|
| Other values (45) | 48 | 1.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.8 | 1 | 0.0% |
|
| 0.87 | 1 | 0.0% |
|
| 0.97 | 1 | 0.0% |
|
| 1.08 | 1 | 0.0% |
|
| 1.1 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2.1 | 2 | 0.1% |
|
| 2.13 | 1 | 0.0% |
|
| 2.17 | 1 | 0.0% |
|
| 2.25 | 1 | 0.0% |
|
| 2.31 | 1 | 0.0% |
|
Value_SP500_PSR_YEAR
Numeric
| Distinct count | 16 |
|---|---|
| Unique (%) | 0.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.3116 |
|---|---|
| Minimum | 0.87 |
| Maximum | 2.31 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.87 |
|---|---|
| 5-th percentile | 1.31 |
| Q1 | 1.31 |
| Median | 1.31 |
| Q3 | 1.31 |
| 95-th percentile | 1.31 |
| Maximum | 2.31 |
| Range | 1.44 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.033841 |
|---|---|
| Coef of variation | 0.025802 |
| Kurtosis | 506.53 |
| Mean | 1.3116 |
| MAD | 0.003505 |
| Skewness | 20.076 |
| Sum | 4653.5 |
| Variance | 0.0011452 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 1.31 | 3530 | 99.5% |
|
| 1.56 | 2 | 0.1% |
|
| 1.43 | 2 | 0.1% |
|
| 1.77 | 2 | 0.1% |
|
| 1.66 | 1 | 0.0% |
|
| 0.87 | 1 | 0.0% |
|
| 1.81 | 1 | 0.0% |
|
| 2.13 | 1 | 0.0% |
|
| 1.3 | 1 | 0.0% |
|
| 1.95 | 1 | 0.0% |
|
| Other values (6) | 6 | 0.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.87 | 1 | 0.0% |
|
| 1.19 | 1 | 0.0% |
|
| 1.23 | 1 | 0.0% |
|
| 1.3 | 1 | 0.0% |
|
| 1.31 | 3530 | 99.5% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1.81 | 1 | 0.0% |
|
| 1.95 | 1 | 0.0% |
|
| 2.13 | 1 | 0.0% |
|
| 2.17 | 1 | 0.0% |
|
| 2.31 | 1 | 0.0% |
|
Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Numeric
| Distinct count | 114 |
|---|---|
| Unique (%) | 3.2% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 11.791 |
|---|---|
| Minimum | -90.27 |
| Maximum | 905.56 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -90.27 |
|---|---|
| 5-th percentile | 11.38 |
| Q1 | 11.38 |
| Median | 11.38 |
| Q3 | 11.38 |
| 95-th percentile | 11.38 |
| Maximum | 905.56 |
| Range | 995.83 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 23.282 |
|---|---|
| Coef of variation | 1.9746 |
| Kurtosis | 1188.3 |
| Mean | 11.791 |
| MAD | 1.7031 |
| Skewness | 32.842 |
| Sum | 41833 |
| Variance | 542.04 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 11.38 | 3435 | 96.8% |
|
| 17.8 | 1 | 0.0% |
|
| -7.72 | 1 | 0.0% |
|
| -31.14 | 1 | 0.0% |
|
| 0.55 | 1 | 0.0% |
|
| -44.15 | 1 | 0.0% |
|
| 11.18 | 1 | 0.0% |
|
| -21.05 | 1 | 0.0% |
|
| -86.8 | 1 | 0.0% |
|
| 15.28 | 1 | 0.0% |
|
| Other values (104) | 104 | 2.9% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -90.27 | 1 | 0.0% |
|
| -86.8 | 1 | 0.0% |
|
| -79.48 | 1 | 0.0% |
|
| -73.86 | 1 | 0.0% |
|
| -51.84 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 72.45 | 1 | 0.0% |
|
| 261.66 | 1 | 0.0% |
|
| 492.53 | 1 | 0.0% |
|
| 870.4 | 1 | 0.0% |
|
| 905.56 | 1 | 0.0% |
|
Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric
| Distinct count | 33 |
|---|---|
| Unique (%) | 0.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -78.62 |
|---|---|
| Minimum | -79.48 |
| Maximum | 261.66 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -79.48 |
|---|---|
| 5-th percentile | -79.48 |
| Q1 | -79.48 |
| Median | -79.48 |
| Q3 | -79.48 |
| 95-th percentile | -79.48 |
| Maximum | 261.66 |
| Range | 341.14 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 10.149 |
|---|---|
| Coef of variation | -0.12909 |
| Kurtosis | 419.42 |
| Mean | -78.62 |
| MAD | 1.7038 |
| Skewness | 17.108 |
| Sum | -278940 |
| Variance | 103.01 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -79.48 | 3516 | 99.1% |
|
| 17.8 | 1 | 0.0% |
|
| 8.1 | 1 | 0.0% |
|
| 261.66 | 1 | 0.0% |
|
| -28.02 | 1 | 0.0% |
|
| 49.72 | 1 | 0.0% |
|
| 0.36 | 1 | 0.0% |
|
| -6.49 | 1 | 0.0% |
|
| 11.38 | 1 | 0.0% |
|
| 10.69 | 1 | 0.0% |
|
| Other values (23) | 23 | 0.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -79.48 | 3516 | 99.1% |
|
| -51.84 | 1 | 0.0% |
|
| -28.02 | 1 | 0.0% |
|
| -21.05 | 1 | 0.0% |
|
| -15.56 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 24.85 | 1 | 0.0% |
|
| 36.07 | 1 | 0.0% |
|
| 49.72 | 1 | 0.0% |
|
| 72.45 | 1 | 0.0% |
|
| 261.66 | 1 | 0.0% |
|
Value_SP500_REAL_PRICE_MONTH
Numeric
| Distinct count | 1400 |
|---|---|
| Unique (%) | 39.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 133.25 |
|---|---|
| Minimum | 2.73 |
| Maximum | 2789.8 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2.73 |
|---|---|
| 5-th percentile | 4.37 |
| Q1 | 4.37 |
| Median | 4.37 |
| Q3 | 16.293 |
| 95-th percentile | 1149.6 |
| Maximum | 2789.8 |
| Range | 2787.1 |
| Interquartile range | 11.922 |
Descriptive statistics
| Standard deviation | 391.81 |
|---|---|
| Coef of variation | 2.9404 |
| Kurtosis | 14.745 |
| Mean | 133.25 |
| MAD | 208.57 |
| Skewness | 3.7646 |
| Sum | 472780 |
| Variance | 153520 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 4.37 | 1784 | 50.3% |
|
| 4.46 | 7 | 0.2% |
|
| 5.32 | 6 | 0.2% |
|
| 5.3 | 6 | 0.2% |
|
| 5.18 | 6 | 0.2% |
|
| 7.68 | 6 | 0.2% |
|
| 4.59 | 5 | 0.1% |
|
| 5.51 | 5 | 0.1% |
|
| 4.65 | 5 | 0.1% |
|
| 8.12 | 5 | 0.1% |
|
| Other values (1390) | 1713 | 48.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2.73 | 1 | 0.0% |
|
| 2.85 | 1 | 0.0% |
|
| 2.94 | 2 | 0.1% |
|
| 3.05 | 1 | 0.0% |
|
| 3.17 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2702.77 | 1 | 0.0% |
|
| 2705.16 | 1 | 0.0% |
|
| 2736.61 | 1 | 0.0% |
|
| 2754.35 | 1 | 0.0% |
|
| 2789.8 | 1 | 0.0% |
|
Value_SP500_REAL_SALES_GROWTH_QUARTER
Numeric
| Distinct count | 66 |
|---|---|
| Unique (%) | 1.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -1.7438 |
|---|---|
| Minimum | -15.81 |
| Maximum | 8.89 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -15.81 |
|---|---|
| 5-th percentile | -1.8 |
| Q1 | -1.8 |
| Median | -1.8 |
| Q3 | -1.8 |
| 95-th percentile | -1.8 |
| Maximum | 8.89 |
| Range | 24.7 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.86472 |
|---|---|
| Coef of variation | -0.49588 |
| Kurtosis | 96.112 |
| Mean | -1.7438 |
| MAD | 0.15904 |
| Skewness | 3.3383 |
| Sum | -6187 |
| Variance | 0.74774 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -1.8 | 3483 | 98.2% |
|
| 7.95 | 1 | 0.0% |
|
| 7.72 | 1 | 0.0% |
|
| -6.85 | 1 | 0.0% |
|
| 5.72 | 1 | 0.0% |
|
| 7.23 | 1 | 0.0% |
|
| 1.13 | 1 | 0.0% |
|
| 2.88 | 1 | 0.0% |
|
| 4.39 | 1 | 0.0% |
|
| 5.09 | 1 | 0.0% |
|
| Other values (56) | 56 | 1.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -15.81 | 1 | 0.0% |
|
| -12.69 | 1 | 0.0% |
|
| -12.66 | 1 | 0.0% |
|
| -9.91 | 1 | 0.0% |
|
| -8.35 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 7.53 | 1 | 0.0% |
|
| 7.72 | 1 | 0.0% |
|
| 7.95 | 1 | 0.0% |
|
| 8.87 | 1 | 0.0% |
|
| 8.89 | 1 | 0.0% |
|
Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis
| Correlation | 0.99905 |
|---|
Value_SP500_REAL_SALES_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis
| Correlation | 0.98094 |
|---|
Value_SP500_REAL_SALES_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_YEAR and should be ignored for analysis
| Correlation | 0.91447 |
|---|
Value_SP500_SALES_GROWTH_QUARTER
Numeric
| Distinct count | 67 |
|---|---|
| Unique (%) | 1.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -16.087 |
|---|---|
| Minimum | -16.46 |
| Maximum | 12.56 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -16.46 |
|---|---|
| 5-th percentile | -16.46 |
| Q1 | -16.46 |
| Median | -16.46 |
| Q3 | -16.46 |
| 95-th percentile | -16.46 |
| Maximum | 12.56 |
| Range | 29.02 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 2.8312 |
|---|---|
| Coef of variation | -0.176 |
| Kurtosis | 61.813 |
| Mean | -16.087 |
| MAD | 0.73302 |
| Skewness | 7.8448 |
| Sum | -57075 |
| Variance | 8.0156 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -16.46 | 3482 | 98.1% |
|
| -2.47 | 1 | 0.0% |
|
| 1.7 | 1 | 0.0% |
|
| 0.2 | 1 | 0.0% |
|
| -8.45 | 1 | 0.0% |
|
| 8.94 | 1 | 0.0% |
|
| 7.62 | 1 | 0.0% |
|
| 9.96 | 1 | 0.0% |
|
| 8.04 | 1 | 0.0% |
|
| 4.81 | 1 | 0.0% |
|
| Other values (57) | 57 | 1.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -16.46 | 3482 | 98.1% |
|
| -12.86 | 1 | 0.0% |
|
| -11.95 | 1 | 0.0% |
|
| -8.45 | 1 | 0.0% |
|
| -7.97 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 10.93 | 1 | 0.0% |
|
| 11.01 | 1 | 0.0% |
|
| 11.29 | 1 | 0.0% |
|
| 11.99 | 1 | 0.0% |
|
| 12.56 | 1 | 0.0% |
|
Value_SP500_SALES_GROWTH_YEAR
Numeric
| Distinct count | 21 |
|---|---|
| Unique (%) | 0.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -12.761 |
|---|---|
| Minimum | -12.86 |
| Maximum | 10.93 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -12.86 |
|---|---|
| 5-th percentile | -12.86 |
| Q1 | -12.86 |
| Median | -12.86 |
| Q3 | -12.86 |
| 95-th percentile | -12.86 |
| Maximum | 10.93 |
| Range | 23.79 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 1.3605 |
|---|---|
| Coef of variation | -0.10661 |
| Kurtosis | 208.16 |
| Mean | -12.761 |
| MAD | 0.19651 |
| Skewness | 14.282 |
| Sum | -45277 |
| Variance | 1.8508 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -12.86 | 3528 | 99.4% |
|
| 2.24 | 1 | 0.0% |
|
| 5.98 | 1 | 0.0% |
|
| 5.37 | 1 | 0.0% |
|
| 10.93 | 1 | 0.0% |
|
| 4.16 | 1 | 0.0% |
|
| -1.18 | 1 | 0.0% |
|
| 7.03 | 1 | 0.0% |
|
| 1.7 | 1 | 0.0% |
|
| 3.76 | 1 | 0.0% |
|
| Other values (11) | 11 | 0.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -12.86 | 3528 | 99.4% |
|
| -8.45 | 1 | 0.0% |
|
| -3.11 | 1 | 0.0% |
|
| -1.18 | 1 | 0.0% |
|
| 1.7 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 8.94 | 1 | 0.0% |
|
| 9.03 | 1 | 0.0% |
|
| 9.36 | 1 | 0.0% |
|
| 10.88 | 1 | 0.0% |
|
| 10.93 | 1 | 0.0% |
|
Value_SP500_SALES_QUARTER
Numeric
| Distinct count | 71 |
|---|---|
| Unique (%) | 2.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 680.53 |
|---|---|
| Minimum | 674.59 |
| Maximum | 1292.8 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 674.59 |
|---|---|
| 5-th percentile | 674.59 |
| Q1 | 674.59 |
| Median | 674.59 |
| Q3 | 674.59 |
| 95-th percentile | 674.59 |
| Maximum | 1292.8 |
| Range | 618.25 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 48.068 |
|---|---|
| Coef of variation | 0.070634 |
| Kurtosis | 79.358 |
| Mean | 680.53 |
| MAD | 11.646 |
| Skewness | 8.7691 |
| Sum | 2414500 |
| Variance | 2310.6 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 674.59 | 3478 | 98.0% |
|
| 1127.13 | 1 | 0.0% |
|
| 917.93 | 1 | 0.0% |
|
| 998.54 | 1 | 0.0% |
|
| 1185.81 | 1 | 0.0% |
|
| 828.1 | 1 | 0.0% |
|
| 1136.16 | 1 | 0.0% |
|
| 738.81 | 1 | 0.0% |
|
| 965.19 | 1 | 0.0% |
|
| 981.21 | 1 | 0.0% |
|
| Other values (61) | 61 | 1.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 674.59 | 3478 | 98.0% |
|
| 678.6 | 1 | 0.0% |
|
| 684.42 | 1 | 0.0% |
|
| 697.75 | 1 | 0.0% |
|
| 697.9 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1185.81 | 1 | 0.0% |
|
| 1203.1 | 1 | 0.0% |
|
| 1231.57 | 1 | 0.0% |
|
| 1259.18 | 1 | 0.0% |
|
| 1292.84 | 1 | 0.0% |
|
Value_SP500_SALES_YEAR
Numeric
| Distinct count | 22 |
|---|---|
| Unique (%) | 0.6% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 676.69 |
|---|---|
| Minimum | 674.59 |
| Maximum | 1292.8 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 674.59 |
|---|---|
| 5-th percentile | 674.59 |
| Q1 | 674.59 |
| Median | 674.59 |
| Q3 | 674.59 |
| 95-th percentile | 674.59 |
| Maximum | 1292.8 |
| Range | 618.25 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 30.328 |
|---|---|
| Coef of variation | 0.044818 |
| Kurtosis | 255.59 |
| Mean | 676.69 |
| MAD | 4.169 |
| Skewness | 15.665 |
| Sum | 2400900 |
| Variance | 919.76 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 674.59 | 3527 | 99.4% |
|
| 1127.13 | 1 | 0.0% |
|
| 1092.37 | 1 | 0.0% |
|
| 1203.1 | 1 | 0.0% |
|
| 1292.84 | 1 | 0.0% |
|
| 1169.42 | 1 | 0.0% |
|
| 1163.32 | 1 | 0.0% |
|
| 1042.46 | 1 | 0.0% |
|
| 874.32 | 1 | 0.0% |
|
| 710.81 | 1 | 0.0% |
|
| Other values (12) | 12 | 0.3% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 674.59 | 3527 | 99.4% |
|
| 710.81 | 1 | 0.0% |
|
| 736.88 | 1 | 0.0% |
|
| 745.7 | 1 | 0.0% |
|
| 788.17 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1169.42 | 1 | 0.0% |
|
| 1203.1 | 1 | 0.0% |
|
| 1231.57 | 1 | 0.0% |
|
| 1259.18 | 1 | 0.0% |
|
| 1292.84 | 1 | 0.0% |
|
| Value_SP500_REAL_PRICE_MONTH | Value_SP500_DIV_YIELD_MONTH | Value_SP500_PE_RATIO_MONTH | Value_SHILLER_PE_RATIO_MONTH | Value_SP500_EARNINGS_YIELD_MONTH | Value_SP500_INFLADJ_MONTH | Value_SP500_PSR_QUARTER | Value_SP500_DIV_MONTH | Value_SP500_DIV_YEAR | Value_SP500_DIV_GROWTH_YEAR | Value_SP500_DIV_GROWTH_QUARTER | Value_SP500_PBV_RATIO_QUARTER | Value_SHILLER_PE_RATIO_YEAR | Value_SP500_PE_RATIO_YEAR | Value_SP500_DIV_YIELD_YEAR | Value_SP500_PSR_YEAR | Value_SP500_EARNINGS_YIELD_YEAR | Value_SP500_PBV_RATIO_YEAR | Value_SP500_INFLADJ_YEAR | Value_SP500_SALES_YEAR | Value_SP500_SALES_GROWTH_YEAR | Value_SP500_SALES_QUARTER | Value_SP500_REAL_SALES_GROWTH_QUARTER | Value_SP500_SALES_GROWTH_QUARTER | Value_SP500_REAL_SALES_GROWTH_YEAR | Value_SP500_REAL_EARNINGS_GROWTH_YEAR | Value_SP500_REAL_SALES_YEAR | Value_SP500_REAL_EARNINGS_GROWTH_QUARTER | Value_SP500_EARNINGS_GROWTH_QUARTER | Value_SP500_REAL_SALES_QUARTER | Value_SP500_EARNINGS_MONTH | Value_SP500_BVPS_YEAR | Value_SP500_EARNINGS_YEAR | Value_SP500_EARNINGS_GROWTH_YEAR | Value_SP500_BVPS_QUARTER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||||||||||||||||
| 1871-01-01 | 4.44 | 4.22 | 11.10 | 11.34 | 9.01 | 89.81 | 1.43 | 7.19 | 6.70 | -21.07 | 11.89 | 2.58 | 11.9 | 11.10 | 4.07 | 1.31 | 9.01 | 2.17 | 89.81 | 674.59 | -12.86 | 674.59 | -1.8 | -16.46 | -12.69 | -79.48 | 943.84 | 11.38 | 17.75 | 942.41 | 8.83 | 290.68 | 7.95 | -77.52 | 290.68 |
| 1871-01-31 | 4.37 | 5.86 | 15.61 | 11.34 | 5.62 | 99.41 | 1.43 | 5.26 | 5.15 | -21.07 | 11.89 | 2.58 | 11.9 | 11.82 | 5.86 | 1.31 | 5.33 | 2.17 | 81.79 | 674.59 | -12.86 | 674.59 | -1.8 | -16.46 | -12.69 | -79.48 | 943.84 | 11.38 | 17.75 | 942.41 | 8.09 | 290.68 | 7.92 | -77.52 | 290.68 |
| 1871-02-01 | 4.50 | 4.22 | 11.25 | 10.92 | 8.89 | 88.33 | 1.43 | 7.19 | 6.70 | -21.07 | 11.89 | 2.58 | 11.9 | 11.82 | 4.07 | 1.31 | 5.33 | 2.17 | 81.79 | 674.59 | -12.86 | 674.59 | -1.8 | -16.46 | -12.69 | -79.48 | 943.84 | 11.38 | 17.75 | 942.41 | 8.83 | 290.68 | 7.95 | -77.52 | 290.68 |
| 1871-02-28 | 4.37 | 5.78 | 15.61 | 11.34 | 5.62 | 99.41 | 1.43 | 5.10 | 6.70 | -21.07 | 11.89 | 2.58 | 11.9 | 11.82 | 4.07 | 1.31 | 5.33 | 2.17 | 81.79 | 674.59 | -12.86 | 674.59 | -1.8 | -16.46 | -12.69 | -79.48 | 943.84 | 11.38 | 17.75 | 942.41 | 7.85 | 290.68 | 7.95 | -77.52 | 290.68 |
| 1871-03-01 | 4.61 | 4.22 | 11.52 | 11.19 | 8.68 | 89.17 | 1.43 | 7.19 | 6.70 | -21.07 | 11.89 | 2.58 | 11.9 | 11.82 | 4.07 | 1.31 | 5.33 | 2.17 | 81.79 | 674.59 | -12.86 | 674.59 | -1.8 | -16.46 | -12.69 | -79.48 | 943.84 | 11.38 | 17.75 | 942.41 | 8.83 | 290.68 | 7.95 | -77.52 | 290.68 |
#Apply linear interpolation on the dataset.
df_interpolate = df.interpolate(method='linear',axis=0,inplace=False,limit_direction='both')
print(df_imputed.shape)
print(df_interpolate.shape)
(3548, 35) (3548, 35)
#Full profile on interpolated dataset.
pandas_profiling.ProfileReport(df_interpolate)
Dataset info
| Number of variables | 36 |
|---|---|
| Number of observations | 3548 |
| Total Missing (%) | 0.0% |
| Total size in memory | 998.0 KiB |
| Average record size in memory | 288.0 B |
Variables types
| Numeric | 13 |
|---|---|
| Categorical | 0 |
| Boolean | 0 |
| Date | 1 |
| Text (Unique) | 0 |
| Rejected | 22 |
| Unsupported | 0 |
Warnings
Value_SHILLER_PE_RATIO_YEAR is highly correlated with Value_SHILLER_PE_RATIO_MONTH (ρ = 0.98989) RejectedValue_SP500_BVPS_QUARTER is highly correlated with Value_SP500_BVPS_YEAR (ρ = 0.99959) RejectedValue_SP500_BVPS_YEAR is highly correlated with Value_SP500_SALES_QUARTER (ρ = 0.97772) RejectedValue_SP500_DIV_GROWTH_QUARTER is highly correlated with Value_SP500_DIV_GROWTH_YEAR (ρ = 0.99279) RejectedValue_SP500_DIV_MONTH is highly correlated with Value_SP500_INFLADJ_MONTH (ρ = 0.91991) RejectedValue_SP500_DIV_YEAR is highly correlated with Value_SP500_DIV_MONTH (ρ = 0.99956) RejectedValue_SP500_DIV_YIELD_YEAR is highly correlated with Value_SP500_DIV_YIELD_MONTH (ρ = 0.97677) RejectedValue_SP500_EARNINGS_GROWTH_QUARTER is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER (ρ = 0.99925) RejectedValue_SP500_EARNINGS_GROWTH_YEAR is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR (ρ = 0.99856) RejectedValue_SP500_EARNINGS_MONTH is highly correlated with Value_SP500_INFLADJ_YEAR (ρ = 0.9169) RejectedValue_SP500_EARNINGS_YEAR is highly correlated with Value_SP500_EARNINGS_MONTH (ρ = 0.99519) RejectedValue_SP500_EARNINGS_YIELD_YEAR is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH (ρ = 0.98185) RejectedValue_SP500_INFLADJ_MONTH is highly correlated with Value_SP500_REAL_PRICE_MONTH (ρ = 0.9681) RejectedValue_SP500_INFLADJ_YEAR is highly correlated with Value_SP500_DIV_YEAR (ρ = 0.92035) RejectedValue_SP500_PBV_RATIO_YEAR is highly correlated with Value_SP500_PBV_RATIO_QUARTER (ρ = 0.99857) RejectedValue_SP500_PSR_YEAR is highly correlated with Value_SP500_PSR_QUARTER (ρ = 0.98853) RejectedValue_SP500_REAL_SALES_GROWTH_QUARTER is highly correlated with Value_SP500_SALES_GROWTH_YEAR (ρ = 0.9797) RejectedValue_SP500_REAL_SALES_GROWTH_YEAR is highly correlated with Value_SP500_SALES_GROWTH_QUARTER (ρ = 0.96667) RejectedValue_SP500_REAL_SALES_QUARTER is highly correlated with Value_SP500_REAL_SALES_YEAR (ρ = 0.99537) RejectedValue_SP500_REAL_SALES_YEAR is highly correlated with Value_SP500_SALES_QUARTER (ρ = 0.90087) RejectedValue_SP500_SALES_GROWTH_QUARTER is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER (ρ = 0.99017) RejectedValue_SP500_SALES_QUARTER is highly correlated with Value_SP500_SALES_YEAR (ρ = 0.99943) RejectedDate
Date
| Distinct count | 3548 |
|---|---|
| Unique (%) | 100.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Minimum | 1871-01-01 00:00:00 |
|---|---|
| Maximum | 2018-12-31 00:00:00 |
Value_SHILLER_PE_RATIO_MONTH
Numeric
| Distinct count | 2354 |
|---|---|
| Unique (%) | 66.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 16.578 |
|---|---|
| Minimum | 4.78 |
| Maximum | 44.19 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 4.78 |
|---|---|
| 5-th percentile | 7.9768 |
| Q1 | 11.62 |
| Median | 15.655 |
| Q3 | 20.152 |
| 95-th percentile | 28.102 |
| Maximum | 44.19 |
| Range | 39.41 |
| Interquartile range | 8.5325 |
Descriptive statistics
| Standard deviation | 6.676 |
|---|---|
| Coef of variation | 0.4027 |
| Kurtosis | 1.9619 |
| Mean | 16.578 |
| MAD | 5.1107 |
| Skewness | 1.1209 |
| Sum | 58820 |
| Variance | 44.57 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 13.8 | 7 | 0.2% |
|
| 11.34 | 7 | 0.2% |
|
| 17.65 | 7 | 0.2% |
|
| 15.23 | 7 | 0.2% |
|
| 17.82 | 7 | 0.2% |
|
| 12.43 | 6 | 0.2% |
|
| 10.91 | 6 | 0.2% |
|
| 11.64 | 6 | 0.2% |
|
| 15.27 | 6 | 0.2% |
|
| 18.07 | 6 | 0.2% |
|
| Other values (2344) | 3483 | 98.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 4.78 | 1 | 0.0% |
|
| 4.95 | 1 | 0.0% |
|
| 4.955 | 1 | 0.0% |
|
| 5.02 | 1 | 0.0% |
|
| 5.04 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 43.7 | 1 | 0.0% |
|
| 43.77 | 1 | 0.0% |
|
| 43.83 | 1 | 0.0% |
|
| 43.980000000000004 | 1 | 0.0% |
|
| 44.19 | 1 | 0.0% |
|
Value_SHILLER_PE_RATIO_YEAR
Highly correlated
This variable is highly correlated with Value_SHILLER_PE_RATIO_MONTH and should be ignored for analysis
| Correlation | 0.98989 |
|---|
Value_SP500_BVPS_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_BVPS_YEAR and should be ignored for analysis
| Correlation | 0.99959 |
|---|
Value_SP500_BVPS_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis
| Correlation | 0.97772 |
|---|
Value_SP500_DIV_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_DIV_GROWTH_YEAR and should be ignored for analysis
| Correlation | 0.99279 |
|---|
Value_SP500_DIV_GROWTH_YEAR
Numeric
| Distinct count | 689 |
|---|---|
| Unique (%) | 19.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 11.924 |
|---|---|
| Minimum | -21.07 |
| Maximum | 18.25 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -21.07 |
|---|---|
| 5-th percentile | 2.3661 |
| Q1 | 13.38 |
| Median | 13.38 |
| Q3 | 13.38 |
| 95-th percentile | 13.38 |
| Maximum | 18.25 |
| Range | 39.32 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 4.2003 |
|---|---|
| Coef of variation | 0.35224 |
| Kurtosis | 15.155 |
| Mean | 11.924 |
| MAD | 2.5096 |
| Skewness | -3.5077 |
| Sum | 42308 |
| Variance | 17.643 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 13.38 | 2856 | 80.5% |
|
| 8.65 | 4 | 0.1% |
|
| 10.68 | 2 | 0.1% |
|
| 13.4225 | 1 | 0.0% |
|
| 4.31625 | 1 | 0.0% |
|
| 16.26 | 1 | 0.0% |
|
| 11.246666666666666 | 1 | 0.0% |
|
| 13.151666666666667 | 1 | 0.0% |
|
| 17.255000000000003 | 1 | 0.0% |
|
| 12.135 | 1 | 0.0% |
|
| Other values (679) | 679 | 19.1% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -21.07 | 1 | 0.0% |
|
| -20.131666666666668 | 1 | 0.0% |
|
| -20.09375 | 1 | 0.0% |
|
| -19.193333333333335 | 1 | 0.0% |
|
| -19.1175 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 17.989166666666666 | 1 | 0.0% |
|
| 18.00125 | 1 | 0.0% |
|
| 18.08416666666667 | 1 | 0.0% |
|
| 18.167083333333334 | 1 | 0.0% |
|
| 18.25 | 1 | 0.0% |
|
Value_SP500_DIV_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_INFLADJ_MONTH and should be ignored for analysis
| Correlation | 0.91991 |
|---|
Value_SP500_DIV_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_DIV_MONTH and should be ignored for analysis
| Correlation | 0.99956 |
|---|
Value_SP500_DIV_YIELD_MONTH
Numeric
| Distinct count | 1295 |
|---|---|
| Unique (%) | 36.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.3469 |
|---|---|
| Minimum | 1.11 |
| Maximum | 13.84 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.11 |
|---|---|
| 5-th percentile | 1.74 |
| Q1 | 3.1588 |
| Median | 4.285 |
| Q3 | 5.39 |
| 95-th percentile | 7.18 |
| Maximum | 13.84 |
| Range | 12.73 |
| Interquartile range | 2.2312 |
Descriptive statistics
| Standard deviation | 1.6983 |
|---|---|
| Coef of variation | 0.39069 |
| Kurtosis | 0.77793 |
| Mean | 4.3469 |
| MAD | 1.3423 |
| Skewness | 0.47674 |
| Sum | 15423 |
| Variance | 2.8842 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 5.18 | 15 | 0.4% |
|
| 5.01 | 13 | 0.4% |
|
| 1.94 | 12 | 0.3% |
|
| 1.76 | 12 | 0.3% |
|
| 5.22 | 12 | 0.3% |
|
| 3.7 | 11 | 0.3% |
|
| 4.22 | 11 | 0.3% |
|
| 4.92 | 10 | 0.3% |
|
| 2.96 | 10 | 0.3% |
|
| 3.49 | 10 | 0.3% |
|
| Other values (1285) | 3432 | 96.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.11 | 3 | 0.1% |
|
| 1.12 | 1 | 0.0% |
|
| 1.13 | 1 | 0.0% |
|
| 1.1349999999999998 | 1 | 0.0% |
|
| 1.14 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 12.46 | 1 | 0.0% |
|
| 12.64 | 1 | 0.0% |
|
| 13.15 | 1 | 0.0% |
|
| 13.24 | 1 | 0.0% |
|
| 13.84 | 1 | 0.0% |
|
Value_SP500_DIV_YIELD_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_DIV_YIELD_MONTH and should be ignored for analysis
| Correlation | 0.97677 |
|---|
Value_SP500_EARNINGS_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 0.99925 |
|---|
Value_SP500_EARNINGS_GROWTH_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_REAL_EARNINGS_GROWTH_YEAR and should be ignored for analysis
| Correlation | 0.99856 |
|---|
Value_SP500_EARNINGS_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_INFLADJ_YEAR and should be ignored for analysis
| Correlation | 0.9169 |
|---|
Value_SP500_EARNINGS_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_EARNINGS_MONTH and should be ignored for analysis
| Correlation | 0.99519 |
|---|
Value_SP500_EARNINGS_YIELD_MONTH
Numeric
| Distinct count | 1649 |
|---|---|
| Unique (%) | 46.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 7.3645 |
|---|---|
| Minimum | 0.81 |
| Maximum | 18.82 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.81 |
|---|---|
| 5-th percentile | 3.8938 |
| Q1 | 5.55 |
| Median | 6.78 |
| Q3 | 8.755 |
| 95-th percentile | 12.783 |
| Maximum | 18.82 |
| Range | 18.01 |
| Interquartile range | 3.205 |
Descriptive statistics
| Standard deviation | 2.7021 |
|---|---|
| Coef of variation | 0.36691 |
| Kurtosis | 1.1874 |
| Mean | 7.3645 |
| MAD | 2.0858 |
| Skewness | 0.96555 |
| Sum | 26129 |
| Variance | 7.3012 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 5.62 | 15 | 0.4% |
|
| 5.29 | 13 | 0.4% |
|
| 5.54 | 13 | 0.4% |
|
| 5.33 | 12 | 0.3% |
|
| 5.72 | 11 | 0.3% |
|
| 6.23 | 11 | 0.3% |
|
| 5.3 | 11 | 0.3% |
|
| 5.38 | 11 | 0.3% |
|
| 5.48 | 11 | 0.3% |
|
| 5.26 | 11 | 0.3% |
|
| Other values (1639) | 3429 | 96.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.81 | 3 | 0.1% |
|
| 0.8200000000000001 | 1 | 0.0% |
|
| 0.83 | 1 | 0.0% |
|
| 0.87 | 1 | 0.0% |
|
| 0.895 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 17.845 | 1 | 0.0% |
|
| 18.12 | 1 | 0.0% |
|
| 18.48 | 1 | 0.0% |
|
| 18.65 | 1 | 0.0% |
|
| 18.82 | 1 | 0.0% |
|
Value_SP500_EARNINGS_YIELD_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_EARNINGS_YIELD_MONTH and should be ignored for analysis
| Correlation | 0.98185 |
|---|
Value_SP500_INFLADJ_MONTH
Highly correlated
This variable is highly correlated with Value_SP500_REAL_PRICE_MONTH and should be ignored for analysis
| Correlation | 0.9681 |
|---|
Value_SP500_INFLADJ_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_DIV_YEAR and should be ignored for analysis
| Correlation | 0.92035 |
|---|
Value_SP500_PBV_RATIO_QUARTER
Numeric
| Distinct count | 333 |
|---|---|
| Unique (%) | 9.4% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 4.7592 |
|---|---|
| Minimum | 1.78 |
| Maximum | 5.06 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 1.78 |
|---|---|
| 5-th percentile | 2.6767 |
| Q1 | 5.05 |
| Median | 5.05 |
| Q3 | 5.05 |
| 95-th percentile | 5.05 |
| Maximum | 5.06 |
| Range | 3.28 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.79199 |
|---|---|
| Coef of variation | 0.16641 |
| Kurtosis | 4.4417 |
| Mean | 4.7592 |
| MAD | 0.50877 |
| Skewness | -2.475 |
| Sum | 16886 |
| Variance | 0.62724 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 5.05 | 3096 | 87.3% |
|
| 2.76 | 9 | 0.3% |
|
| 2.91 | 8 | 0.2% |
|
| 2.74 | 7 | 0.2% |
|
| 2.73 | 5 | 0.1% |
|
| 2.19 | 4 | 0.1% |
|
| 2.63 | 4 | 0.1% |
|
| 2.58 | 4 | 0.1% |
|
| 2.81 | 4 | 0.1% |
|
| 2.763333333333333 | 4 | 0.1% |
|
| Other values (323) | 403 | 11.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 1.78 | 1 | 0.0% |
|
| 1.7983333333333333 | 1 | 0.0% |
|
| 1.8166666666666667 | 2 | 0.1% |
|
| 1.835 | 1 | 0.0% |
|
| 1.85 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 5.053333333333333 | 1 | 0.0% |
|
| 5.055 | 1 | 0.0% |
|
| 5.056666666666667 | 1 | 0.0% |
|
| 5.058333333333333 | 1 | 0.0% |
|
| 5.06 | 1 | 0.0% |
|
Value_SP500_PBV_RATIO_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_PBV_RATIO_QUARTER and should be ignored for analysis
| Correlation | 0.99857 |
|---|
Value_SP500_PE_RATIO_MONTH
Numeric
| Distinct count | 2218 |
|---|---|
| Unique (%) | 62.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.731 |
|---|---|
| Minimum | 5.31 |
| Maximum | 123.73 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.31 |
|---|---|
| 5-th percentile | 7.8202 |
| Q1 | 11.429 |
| Median | 14.75 |
| Q3 | 18.03 |
| 95-th percentile | 25.691 |
| Maximum | 123.73 |
| Range | 118.42 |
| Interquartile range | 6.6013 |
Descriptive statistics
| Standard deviation | 8.3823 |
|---|---|
| Coef of variation | 0.53285 |
| Kurtosis | 67.083 |
| Mean | 15.731 |
| MAD | 4.6349 |
| Skewness | 6.4167 |
| Sum | 55814 |
| Variance | 70.263 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 15.61 | 8 | 0.2% |
|
| 12.21 | 7 | 0.2% |
|
| 9.84 | 7 | 0.2% |
|
| 10.34 | 7 | 0.2% |
|
| 14.51 | 6 | 0.2% |
|
| 11.48 | 6 | 0.2% |
|
| 14.75 | 6 | 0.2% |
|
| 10.98 | 6 | 0.2% |
|
| 19.0 | 6 | 0.2% |
|
| 15.11 | 5 | 0.1% |
|
| Other values (2208) | 3484 | 98.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.31 | 1 | 0.0% |
|
| 5.359999999999999 | 1 | 0.0% |
|
| 5.41 | 1 | 0.0% |
|
| 5.525 | 1 | 0.0% |
|
| 5.609999999999999 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 119.85 | 1 | 0.0% |
|
| 121.78999999999999 | 1 | 0.0% |
|
| 123.32 | 1 | 0.0% |
|
| 123.525 | 1 | 0.0% |
|
| 123.73 | 1 | 0.0% |
|
Value_SP500_PE_RATIO_YEAR
Numeric
| Distinct count | 3346 |
|---|---|
| Unique (%) | 94.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 15.741 |
|---|---|
| Minimum | 5.74 |
| Maximum | 70.91 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 5.74 |
|---|---|
| 5-th percentile | 8.0235 |
| Q1 | 11.653 |
| Median | 15.053 |
| Q3 | 18.011 |
| 95-th percentile | 25.639 |
| Maximum | 70.91 |
| Range | 65.17 |
| Interquartile range | 6.3581 |
Descriptive statistics
| Standard deviation | 6.6443 |
|---|---|
| Coef of variation | 0.42211 |
| Kurtosis | 14.325 |
| Mean | 15.741 |
| MAD | 4.3673 |
| Skewness | 2.7959 |
| Sum | 55848 |
| Variance | 44.147 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 18.08 | 3 | 0.1% |
|
| 17.496666666666666 | 3 | 0.1% |
|
| 15.59 | 3 | 0.1% |
|
| 11.82 | 3 | 0.1% |
|
| 14.835 | 3 | 0.1% |
|
| 17.45 | 3 | 0.1% |
|
| 16.85 | 3 | 0.1% |
|
| 17.21666666666667 | 3 | 0.1% |
|
| 16.3 | 3 | 0.1% |
|
| 12.95125 | 3 | 0.1% |
|
| Other values (3336) | 3518 | 99.2% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 5.74 | 1 | 0.0% |
|
| 5.765000000000001 | 1 | 0.0% |
|
| 5.79 | 1 | 0.0% |
|
| 5.815 | 1 | 0.0% |
|
| 5.832916666666667 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 66.72583333333333 | 1 | 0.0% |
|
| 66.78916666666666 | 1 | 0.0% |
|
| 68.81791666666666 | 1 | 0.0% |
|
| 68.84958333333333 | 1 | 0.0% |
|
| 70.91 | 1 | 0.0% |
|
Value_SP500_PSR_QUARTER
Numeric
| Distinct count | 300 |
|---|---|
| Unique (%) | 8.5% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 1.738 |
|---|---|
| Minimum | 0.8 |
| Maximum | 2.31 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 0.8 |
|---|---|
| 5-th percentile | 1.4339 |
| Q1 | 1.77 |
| Median | 1.77 |
| Q3 | 1.77 |
| 95-th percentile | 1.77 |
| Maximum | 2.31 |
| Range | 1.51 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 0.13545 |
|---|---|
| Coef of variation | 0.077933 |
| Kurtosis | 13.779 |
| Mean | 1.738 |
| MAD | 0.067544 |
| Skewness | -3.163 |
| Sum | 6166.4 |
| Variance | 0.018346 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 1.77 | 3122 | 88.0% |
|
| 1.44 | 10 | 0.3% |
|
| 1.66 | 7 | 0.2% |
|
| 1.52 | 7 | 0.2% |
|
| 2.1 | 7 | 0.2% |
|
| 1.43 | 6 | 0.2% |
|
| 1.46 | 6 | 0.2% |
|
| 1.3166666666666667 | 5 | 0.1% |
|
| 1.47 | 4 | 0.1% |
|
| 1.33 | 4 | 0.1% |
|
| Other values (290) | 370 | 10.4% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 0.8 | 1 | 0.0% |
|
| 0.8116666666666668 | 1 | 0.0% |
|
| 0.8233333333333334 | 1 | 0.0% |
|
| 0.8283333333333334 | 1 | 0.0% |
|
| 0.835 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2.19 | 1 | 0.0% |
|
| 2.205 | 1 | 0.0% |
|
| 2.25 | 1 | 0.0% |
|
| 2.2575000000000003 | 1 | 0.0% |
|
| 2.31 | 1 | 0.0% |
|
Value_SP500_PSR_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_PSR_QUARTER and should be ignored for analysis
| Correlation | 0.98853 |
|---|
Value_SP500_REAL_EARNINGS_GROWTH_QUARTER
Numeric
| Distinct count | 675 |
|---|---|
| Unique (%) | 19.0% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -1.6836 |
|---|---|
| Minimum | -90.27 |
| Maximum | 905.56 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -90.27 |
|---|---|
| 5-th percentile | -7.94 |
| Q1 | -7.94 |
| Median | -7.94 |
| Q3 | -7.94 |
| 95-th percentile | 16.01 |
| Maximum | 905.56 |
| Range | 995.83 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 56.051 |
|---|---|
| Coef of variation | -33.292 |
| Kurtosis | 184.12 |
| Mean | -1.6836 |
| MAD | 13.005 |
| Skewness | 12.982 |
| Sum | -5973.4 |
| Variance | 3141.7 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -7.94 | 2856 | 80.5% |
|
| 14.93 | 9 | 0.3% |
|
| 11.38 | 7 | 0.2% |
|
| -0.54 | 2 | 0.1% |
|
| -12.535 | 2 | 0.1% |
|
| -13.59 | 2 | 0.1% |
|
| 14.05 | 2 | 0.1% |
|
| 13.870000000000001 | 1 | 0.0% |
|
| -14.786666666666667 | 1 | 0.0% |
|
| -17.93999999999999 | 1 | 0.0% |
|
| Other values (665) | 665 | 18.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -90.27 | 1 | 0.0% |
|
| -89.69166666666666 | 1 | 0.0% |
|
| -89.11333333333333 | 1 | 0.0% |
|
| -88.535 | 1 | 0.0% |
|
| -88.47166666666666 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 882.12 | 1 | 0.0% |
|
| 887.98 | 1 | 0.0% |
|
| 893.8399999999999 | 1 | 0.0% |
|
| 899.6999999999999 | 1 | 0.0% |
|
| 905.56 | 1 | 0.0% |
|
Value_SP500_REAL_EARNINGS_GROWTH_YEAR
Numeric
| Distinct count | 685 |
|---|---|
| Unique (%) | 19.3% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -3.8153 |
|---|---|
| Minimum | -79.48 |
| Maximum | 261.66 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -79.48 |
|---|---|
| 5-th percentile | -7.94 |
| Q1 | -7.94 |
| Median | -7.94 |
| Q3 | -7.94 |
| 95-th percentile | 16.192 |
| Maximum | 261.66 |
| Range | 341.14 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 20.964 |
|---|---|
| Coef of variation | -5.4947 |
| Kurtosis | 69.189 |
| Mean | -3.8153 |
| MAD | 8.4864 |
| Skewness | 7.1384 |
| Sum | -13537 |
| Variance | 439.48 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -7.94 | 2856 | 80.5% |
|
| 14.93 | 9 | 0.3% |
|
| -15.020833333333332 | 1 | 0.0% |
|
| 4.810000000000002 | 1 | 0.0% |
|
| 12.56375 | 1 | 0.0% |
|
| -28.35375 | 1 | 0.0% |
|
| 22.085 | 1 | 0.0% |
|
| 34.3975 | 1 | 0.0% |
|
| -0.12124999999999997 | 1 | 0.0% |
|
| 7.221666666666667 | 1 | 0.0% |
|
| Other values (675) | 675 | 19.0% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -79.48 | 1 | 0.0% |
|
| -77.04541666666668 | 1 | 0.0% |
|
| -74.61083333333335 | 1 | 0.0% |
|
| -72.17625000000001 | 1 | 0.0% |
|
| -69.74166666666667 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 235.16750000000002 | 1 | 0.0% |
|
| 243.99833333333336 | 1 | 0.0% |
|
| 247.44583333333338 | 1 | 0.0% |
|
| 252.82916666666668 | 1 | 0.0% |
|
| 261.66 | 1 | 0.0% |
|
Value_SP500_REAL_PRICE_MONTH
Numeric
| Distinct count | 2827 |
|---|---|
| Unique (%) | 79.7% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 267.14 |
|---|---|
| Minimum | 2.73 |
| Maximum | 2789.8 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 2.73 |
|---|---|
| 5-th percentile | 4.4 |
| Q1 | 7.8138 |
| Median | 16.555 |
| Q3 | 128.36 |
| 95-th percentile | 1428.9 |
| Maximum | 2789.8 |
| Range | 2787.1 |
| Interquartile range | 120.55 |
Descriptive statistics
| Standard deviation | 533.7 |
|---|---|
| Coef of variation | 1.9978 |
| Kurtosis | 5.5084 |
| Mean | 267.14 |
| MAD | 366.21 |
| Skewness | 2.4362 |
| Sum | 947810 |
| Variance | 284830 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 7.68 | 11 | 0.3% |
|
| 5.18 | 9 | 0.3% |
|
| 4.37 | 9 | 0.3% |
|
| 4.46 | 9 | 0.3% |
|
| 2736.61 | 8 | 0.2% |
|
| 5.25 | 8 | 0.2% |
|
| 9.3 | 7 | 0.2% |
|
| 4.54 | 7 | 0.2% |
|
| 8.83 | 6 | 0.2% |
|
| 5.33 | 6 | 0.2% |
|
| Other values (2817) | 3468 | 97.7% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 2.73 | 1 | 0.0% |
|
| 2.79 | 1 | 0.0% |
|
| 2.835 | 1 | 0.0% |
|
| 2.85 | 1 | 0.0% |
|
| 2.94 | 3 | 0.1% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 2736.61 | 8 | 0.2% |
|
| 2745.48 | 1 | 0.0% |
|
| 2747.48 | 1 | 0.0% |
|
| 2754.35 | 1 | 0.0% |
|
| 2789.8 | 1 | 0.0% |
|
Value_SP500_REAL_SALES_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_SALES_GROWTH_YEAR and should be ignored for analysis
| Correlation | 0.9797 |
|---|
Value_SP500_REAL_SALES_GROWTH_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_SALES_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 0.96667 |
|---|
Value_SP500_REAL_SALES_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_SALES_YEAR and should be ignored for analysis
| Correlation | 0.99537 |
|---|
Value_SP500_REAL_SALES_YEAR
Highly correlated
This variable is highly correlated with Value_SP500_SALES_QUARTER and should be ignored for analysis
| Correlation | 0.90087 |
|---|
Value_SP500_SALES_GROWTH_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_REAL_SALES_GROWTH_QUARTER and should be ignored for analysis
| Correlation | 0.99017 |
|---|
Value_SP500_SALES_GROWTH_YEAR
Numeric
| Distinct count | 394 |
|---|---|
| Unique (%) | 11.1% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | -0.64806 |
|---|---|
| Minimum | -12.86 |
| Maximum | 10.93 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | -12.86 |
|---|---|
| 5-th percentile | -1.18 |
| Q1 | -1.18 |
| Median | -1.18 |
| Q3 | -1.18 |
| 95-th percentile | 5.1897 |
| Maximum | 10.93 |
| Range | 23.79 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 2.4013 |
|---|---|
| Coef of variation | -3.7053 |
| Kurtosis | 11.202 |
| Mean | -0.64806 |
| MAD | 1.1652 |
| Skewness | 2.698 |
| Sum | -2299.3 |
| Variance | 5.766 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| -1.18 | 3144 | 88.6% |
|
| 9.03 | 9 | 0.3% |
|
| 3.92 | 2 | 0.1% |
|
| 2.62 | 2 | 0.1% |
|
| 3.68 | 2 | 0.1% |
|
| 8.94 | 1 | 0.0% |
|
| 6.684166666666667 | 1 | 0.0% |
|
| 10.266666666666666 | 1 | 0.0% |
|
| 1.44 | 1 | 0.0% |
|
| -2.6916666666666664 | 1 | 0.0% |
|
| Other values (384) | 384 | 10.8% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| -12.86 | 1 | 0.0% |
|
| -12.253333333333332 | 1 | 0.0% |
|
| -12.075 | 1 | 0.0% |
|
| -11.646666666666665 | 1 | 0.0% |
|
| -11.29 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 10.921666666666667 | 1 | 0.0% |
|
| 10.92375 | 1 | 0.0% |
|
| 10.925833333333333 | 1 | 0.0% |
|
| 10.927916666666667 | 1 | 0.0% |
|
| 10.93 | 1 | 0.0% |
|
Value_SP500_SALES_QUARTER
Highly correlated
This variable is highly correlated with Value_SP500_SALES_YEAR and should be ignored for analysis
| Correlation | 0.99943 |
|---|
Value_SP500_SALES_YEAR
Numeric
| Distinct count | 421 |
|---|---|
| Unique (%) | 11.9% |
| Missing (%) | 0.0% |
| Missing (n) | 0 |
| Infinite (%) | 0.0% |
| Infinite (n) | 0 |
| Mean | 773.69 |
|---|---|
| Minimum | 674.59 |
| Maximum | 1292.8 |
| Zeros (%) | 0.0% |
Quantile statistics
| Minimum | 674.59 |
|---|---|
| 5-th percentile | 745.7 |
| Q1 | 745.7 |
| Median | 745.7 |
| Q3 | 745.7 |
| 95-th percentile | 1036.8 |
| Maximum | 1292.8 |
| Range | 618.25 |
| Interquartile range | 0 |
Descriptive statistics
| Standard deviation | 96.243 |
|---|---|
| Coef of variation | 0.12439 |
| Kurtosis | 10.629 |
| Mean | 773.69 |
| MAD | 52.026 |
| Skewness | 3.3842 |
| Sum | 2745100 |
| Variance | 9262.8 |
| Memory size | 27.8 KiB |
| Value | Count | Frequency (%) | |
| 745.7 | 3120 | 87.9% |
|
| 1292.84 | 9 | 0.3% |
|
| 1172.2266666666667 | 1 | 0.0% |
|
| 791.7595833333332 | 1 | 0.0% |
|
| 1140.8675 | 1 | 0.0% |
|
| 870.7304166666668 | 1 | 0.0% |
|
| 1240.7733333333333 | 1 | 0.0% |
|
| 1236.1716666666666 | 1 | 0.0% |
|
| 966.465 | 1 | 0.0% |
|
| 775.2766666666666 | 1 | 0.0% |
|
| Other values (411) | 411 | 11.6% |
|
Minimum 5 values
| Value | Count | Frequency (%) | |
| 674.59 | 1 | 0.0% |
|
| 676.0991666666667 | 1 | 0.0% |
|
| 677.1854166666667 | 1 | 0.0% |
|
| 677.6083333333333 | 1 | 0.0% |
|
| 679.1175000000001 | 1 | 0.0% |
|
Maximum 5 values
| Value | Count | Frequency (%) | |
| 1270.4 | 1 | 0.0% |
|
| 1276.01 | 1 | 0.0% |
|
| 1281.62 | 1 | 0.0% |
|
| 1287.23 | 1 | 0.0% |
|
| 1292.84 | 9 | 0.3% |
|
| Value_SP500_REAL_PRICE_MONTH | Value_SP500_DIV_YIELD_MONTH | Value_SP500_PE_RATIO_MONTH | Value_SHILLER_PE_RATIO_MONTH | Value_SP500_EARNINGS_YIELD_MONTH | Value_SP500_INFLADJ_MONTH | Value_SP500_PSR_QUARTER | Value_SP500_DIV_MONTH | Value_SP500_DIV_YEAR | Value_SP500_DIV_GROWTH_YEAR | Value_SP500_DIV_GROWTH_QUARTER | Value_SP500_PBV_RATIO_QUARTER | Value_SHILLER_PE_RATIO_YEAR | Value_SP500_PE_RATIO_YEAR | Value_SP500_DIV_YIELD_YEAR | Value_SP500_PSR_YEAR | Value_SP500_EARNINGS_YIELD_YEAR | Value_SP500_PBV_RATIO_YEAR | Value_SP500_INFLADJ_YEAR | Value_SP500_SALES_YEAR | Value_SP500_SALES_GROWTH_YEAR | Value_SP500_SALES_QUARTER | Value_SP500_REAL_SALES_GROWTH_QUARTER | Value_SP500_SALES_GROWTH_QUARTER | Value_SP500_REAL_SALES_GROWTH_YEAR | Value_SP500_REAL_EARNINGS_GROWTH_YEAR | Value_SP500_REAL_SALES_YEAR | Value_SP500_REAL_EARNINGS_GROWTH_QUARTER | Value_SP500_EARNINGS_GROWTH_QUARTER | Value_SP500_REAL_SALES_QUARTER | Value_SP500_EARNINGS_MONTH | Value_SP500_BVPS_YEAR | Value_SP500_EARNINGS_YEAR | Value_SP500_EARNINGS_GROWTH_YEAR | Value_SP500_BVPS_QUARTER | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||||||||||||||||
| 1871-01-01 | 4.440 | 5.86 | 11.100 | 10.920 | 9.010 | 89.81 | 1.77 | 5.260 | 5.150000 | 13.38 | 13.38 | 5.05 | 11.9 | 11.100000 | 5.860000 | 1.77 | 9.010000 | 5.05 | 89.8100 | 745.7 | -1.18 | 745.7 | -3.66 | -1.18 | -3.66 | -7.94 | 1087.39 | -7.94 | -3.71 | 1087.39 | 8.090 | 290.68 | 7.920000 | -3.71 | 290.68 |
| 1871-01-31 | 4.470 | 5.86 | 11.175 | 10.920 | 8.950 | 89.07 | 1.77 | 5.260 | 5.150000 | 13.38 | 13.38 | 5.05 | 11.9 | 11.140417 | 5.860000 | 1.77 | 8.979583 | 5.05 | 90.1025 | 745.7 | -1.18 | 745.7 | -3.66 | -1.18 | -3.66 | -7.94 | 1087.39 | -7.94 | -3.71 | 1087.39 | 8.090 | 290.68 | 7.920000 | -3.71 | 290.68 |
| 1871-02-01 | 4.500 | 5.82 | 11.250 | 10.920 | 8.890 | 88.33 | 1.77 | 5.180 | 5.151364 | 13.38 | 13.38 | 5.05 | 11.9 | 11.180833 | 5.843182 | 1.77 | 8.949167 | 5.05 | 90.3950 | 745.7 | -1.18 | 745.7 | -3.66 | -1.18 | -3.66 | -7.94 | 1087.39 | -7.94 | -3.71 | 1087.39 | 7.970 | 290.68 | 7.922273 | -3.71 | 290.68 |
| 1871-02-28 | 4.555 | 5.78 | 11.385 | 11.055 | 8.785 | 88.75 | 1.77 | 5.100 | 5.152727 | 13.38 | 13.38 | 5.05 | 11.9 | 11.221250 | 5.826364 | 1.77 | 8.918750 | 5.05 | 90.6875 | 745.7 | -1.18 | 745.7 | -3.66 | -1.18 | -3.66 | -7.94 | 1087.39 | -7.94 | -3.71 | 1087.39 | 7.850 | 290.68 | 7.924545 | -3.71 | 290.68 |
| 1871-03-01 | 4.610 | 5.71 | 11.520 | 11.190 | 8.680 | 89.17 | 1.77 | 5.065 | 5.154091 | 13.38 | 13.38 | 5.05 | 11.9 | 11.261667 | 5.809545 | 1.77 | 8.888333 | 5.05 | 90.9800 | 745.7 | -1.18 | 745.7 | -3.66 | -1.18 | -3.66 | -7.94 | 1087.39 | -7.94 | -3.71 | 1087.39 | 7.795 | 290.68 | 7.926818 | -3.71 | 290.68 |
autocorrelation_plot(df_imputed)
plt.show()
autocorrelation_plot(df_interpolate)
plt.show()
#Corelation plot after imputation
plt.figure(figsize=(20,15))
sns.heatmap(df_imputed.corr(),annot=True,fmt='.2f',square=False)
<matplotlib.axes._subplots.AxesSubplot at 0x23b2c873b00>
#Correlation heatmap plot after interpolation
plt.figure(figsize=(20,15))
sns.heatmap(df_interpolate.corr(),annot=True,fmt='.2f',square=False)
<matplotlib.axes._subplots.AxesSubplot at 0x23b2aa86400>
#Check for missing values per column and create a graph
Column_missing_values(df_imputed)
Column_missing_values(df_interpolate)
No missing values in provided dataframe No missing values in provided dataframe
df_imputed.plot(figsize=(50,30),fontsize=30)
<matplotlib.axes._subplots.AxesSubplot at 0x23b2cda4a20>
df_interpolate.plot(figsize=(30,20),fontsize=20)
<matplotlib.axes._subplots.AxesSubplot at 0x23b2d3dd240>
df.hist(figsize=(15,10))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D558B00>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D58B4E0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D5B2DD8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D5DF860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D60D320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D60D358>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D662860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D694320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D6B7DA0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D6E6860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D715320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D739DA0>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D76A860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D79A320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D7BFDA0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D7EE860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D81D320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D843DA0>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D872860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D8A3320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D8C5DA0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D8F5860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D924320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D94BDA0>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D978860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D9A8320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2EB00DA0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2EB2E860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2EB5D320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30131DA0>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30161860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30192320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B301B8DA0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B301E5860>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30214320>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3023ADA0>]],
dtype=object)
df_imputed.hist(figsize=(30,20))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B302BD080>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B2D4829E8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B304DE080>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B305D30B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B305F2B38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B305F2B70>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3063E0B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30663B38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B306935F8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B306C30B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B306E8B38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307195F8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307460B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3076AB38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307985F8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307CB0B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B307EFB38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3081F5F8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3084F0B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30873B38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B308A25F8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B308D30B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B308F8B38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309265F8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309560B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3097CB38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309AB5F8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309DB0B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B309FFB38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30A2F5F8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30A5E0B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30A83B38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30AB35F8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30AE40B8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30B08B38>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B30B375F8>]],
dtype=object)
#Histogram plot of all variables in interpolated dataset
df_interpolate.hist(figsize=(30,20))
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3133FBA8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B331B7278>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3312C3C8>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B33226BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B332476A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B332476D8>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3169EBE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B316CD6A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B316FD160>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31721BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B317516A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31781160>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B317A6BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B317D46A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31803160>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3182ABE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B318586A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31888160>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B318ACBE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B318DE6A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3190C160>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31932BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B319616A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B3198F160>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B319B4BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B319E46A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A12160>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A39BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A686A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31A99160>],
[<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31ABEBE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31AED6A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31B1F160>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31B41BE0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31B726A0>,
<matplotlib.axes._subplots.AxesSubplot object at 0x0000023B31BA1160>]],
dtype=object)
linear Interpolation on missing values looks promising and there is better correlation between variables of the dataset. Imputation has not shown better correlation between variables. I would use interpolation dataframe for further analysis.
save df, df_imputed and df_iterpolated to csv file.
#Write dfs to csv file.
df.to_csv('SandP500_Index_Master.csv',index=False)
df_imputed.to_csv('SandP500_Index_df_imputed.csv',index=False)
df_interpolate.to_csv('SandP500_Index_df_Interpolated.csv',index=False)
Apply StandardScaler() to the imputed df and interpolated df to normalize feature values.
# Apply feature scaling on all values to the entire numerical dataframe.define function
def Apply_Standard_Scaler(df):
'''
This funtion applies StandardScaler() to columns/features of a given dataframe.
IN- a pandas dataframe
OUT - df_scaled_features dataframe of scaled features.
scaler - StandardScaler() object.
'''
scaler = StandardScaler()
scaled_features = StandardScaler().fit_transform(df.values)
df_scaled_features = pd.DataFrame(scaled_features,index=df.index,columns = df.columns)
#df_scaled_features.describe()
return df_scaled_features, scaler
print ('Number of columns present in imputed and interpolated datasets are : {} & {}'.format (
len(df_imputed.columns),len(df_interpolate.columns)))
Number of columns present in imputed and interpolated datasets are : 35 & 35
# Apply standard scaling function to both dfs and return scaled df.
df_scaled_features_imputed, scaler_df_imputed = Apply_Standard_Scaler(df_imputed)
df_scaled_features_interpolation, scaler_df_interpolate = Apply_Standard_Scaler(df_interpolate)
Apply PCA feature extraction analysis to find groups of features with highest and lowest variance.
#function to apply PCA feature scaling
def scree_plot(pca):
'''
Creates a scree plot associated with the principal components
INPUT: pca - the result of instance of PCA in scikit learn
OUTPUT:
None
'''
num_components = len(pca.explained_variance_ratio_)
ind = np.arange(num_components)
vals = pca.explained_variance_ratio_
plt.figure(figsize=(25, 10))
ax = plt.subplot(111)
cumvals = np.cumsum(vals)
ax.bar(ind, vals)
ax.plot(ind, cumvals)
#print (ind, cumvals)
for i in range(num_components):
ax.annotate(r"%s%%" % ((str(vals[i]*100)[:4])), (ind[i]+0.2, vals[i]), va="bottom", ha="center", fontsize=12)
ax.xaxis.set_tick_params(width=0)
ax.yaxis.set_tick_params(width=2, length=12)
ax.set_xlabel("Principal Component")
ax.set_ylabel("Variance Explained (%)")
plt.title('Explained Variance Per Principal Component')
# Apply PCA to the data for all features
def Apply_PCA(df_scaled_features,n_components):
'''
This function would create a PCA object in scikit-learn with n_components and apply fit_transform function of PCA on scaled features df.
This function internally calls another 'scree_plot' function to create a chart of feature variance.
IN -
df_scaled_features - numerically scaled dataframe.
n_components - n number of components for PCA feature analysis
Output- Returns PCA object with provided n_components
'''
pca = PCA(n_components)
pca_scaled_features = pca.fit_transform(df_scaled_features)
scree_plot(pca)
return pca
# Map weights for the first principal component to corresponding feature names
# and then print the linked values, sorted by weight.
def sorted_weights(pca, ix, dataset):
"""
Docstring- map the weights and components from PCA analysis.
Input parameters-
Input - pca initialized model
ix = index number of first set of components.
dataset = a dataframe of scaled features
Output- A Tuple of features with variance.
Prints a bar chart with feature names and related variance.
"""
a1 = pca.components_[ix]
a2 = dataset.keys().values
a = list(zip(a1, a2))
a.sort(key=lambda tup: tup[0])
x_labels = [val[0] for val in a]
y_labels = [val[1] for val in a]
plt.Figure(figsize=(20, 15))
ax = pd.Series(x_labels).plot(kind='bar')
ax.set_xticklabels(y_labels)
rects = ax.patches
# for rect, label in zip(rects, x_labels):
# height = rect.get_height()
# ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom')
return a
#Cluster of of features with highest variance
def Print_PCAfeatures_graph(df,a,n):
'''
Function to print PCA features in line chart with top n lowest variance or top n highest variance for a dataframe.
IN-
df- dataframe on which PCA analysis was done.
a - result from 'sorted_weights' function. A sequence of 2-d array with sorted weights of features.
n- number of features required to be printed in the chart. Negative (-n) shall show features with positive variance while
positive (n) would show features with negative variance.
OUT- name of features from cluster a.
'''
groups = []
for i in range(len(a)):
groups.append(a[i][1])
#Slice 1-D array appropriately
if n < 0:
groups = groups[n:]
else:
groups = groups[:n]
#print(groups)
i = 1
for group in groups:
plt.subplot(len(groups), 1, i)
plt.plot(df[group].values)
plt.title(group, y=0.5, loc='center')
i += 1
plt.figure(figsize=(30,20))
plt.show()
return groups
Visualize PCA feature extraction and variance for imputed and interpolated dfs
#Apply PCA for all features except target output.
pca_imputed = Apply_PCA(df_scaled_features_imputed.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1),n_components=10)
#Apply PCA for all features except target output.
pca_interpolated = Apply_PCA(df_scaled_features_interpolation.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1),n_components=10)
Visualize weights of features for first group of extracted features with highest variance
# #Cluster of of features with highest variance
# def Print_PCAfeatures_graph(df,a,n):
# '''
# Function to print PCA features in line chart with top n lowest variance or top n highest variance for a dataframe.
# IN-
# df- dataframe on which PCA analysis was done.
# a - result from 'sorted_weights' function. A sequence of 2-d array with sorted weights of features.
# n- number of features required to be printed in the chart. Negative (-n) shall show features with maximum variance while
# positive n would show features with lowest variance.
# OUT- name of features from cluster a.
# '''
# groups = []
# for i in range(len(a)):
# groups.append(a[i][1])
# #Slice 1-D array appropriately
# if n < 0:
# groups = groups[n:]
# else:
# groups = groups[:n]
# #print(groups)
# i = 1
# for group in groups:
# plt.subplot(len(groups), 1, i)
# plt.plot(df[group].values)
# plt.title(group, y=0.5, loc='center')
# i += 1
# plt.figure(figsize=(30,20))
# plt.show()
# return groups
#List cluster of features with highest variance from PCA analysis
a = sorted_weights(pca_imputed,1,df_scaled_features_imputed)
#print chart with top 5 features with maximum variance
Print_PCAfeatures_graph(df_imputed,a,-5)
<Figure size 3000x2000 with 0 Axes>
['Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_PE_RATIO_MONTH', 'Value_SP500_PBV_RATIO_YEAR', 'Value_SP500_PBV_RATIO_QUARTER']
#print chart with top 5 features with lowest variance
Print_PCAfeatures_graph(df_imputed,a,5)
<Figure size 3000x2000 with 0 Axes>
['Value_SHILLER_PE_RATIO_MONTH', 'Value_SP500_PSR_QUARTER', 'Value_SP500_REAL_SALES_QUARTER', 'Value_SP500_PSR_YEAR', 'Value_SP500_REAL_SALES_GROWTH_QUARTER']
b = sorted_weights(pca_interpolated,1,df_scaled_features_interpolation)
b
[(-0.3099270964617886, 'Value_SP500_DIV_GROWTH_YEAR'), (-0.30472102931433814, 'Value_SP500_DIV_YEAR'), (-0.24703832789002747, 'Value_SP500_REAL_SALES_GROWTH_QUARTER'), (-0.23874586737089795, 'Value_SP500_SALES_YEAR'), (-0.2383254455928687, 'Value_SP500_SALES_QUARTER'), (-0.22862267710171758, 'Value_SP500_SALES_GROWTH_QUARTER'), (-0.2276975766969936, 'Value_SP500_EARNINGS_GROWTH_QUARTER'), (-0.2205363310620414, 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR'), (-0.18570532863264552, 'Value_SHILLER_PE_RATIO_MONTH'), (-0.18030144089471137, 'Value_SP500_PSR_YEAR'), (-0.14675080518992814, 'Value_SP500_SALES_GROWTH_YEAR'), (-0.14261472452590546, 'Value_SP500_INFLADJ_YEAR'), (-0.13310173004335377, 'Value_SP500_PE_RATIO_YEAR'), (-0.13165915021671745, 'Value_SP500_REAL_PRICE_MONTH'), (-0.1275332154522852, 'Value_SP500_INFLADJ_MONTH'), (-0.11806250612104319, 'Value_SP500_EARNINGS_GROWTH_YEAR'), (-0.1177629782635305, 'Value_SP500_EARNINGS_MONTH'), (-0.11682443659890858, 'Value_SP500_DIV_YIELD_YEAR'), (-0.06950409932183431, 'Value_SP500_REAL_SALES_QUARTER'), (-0.0560163591773991, 'Value_SP500_BVPS_YEAR'), (-0.027145876828283743, 'Value_SP500_PSR_QUARTER'), (-0.02606010953354377, 'Value_SP500_DIV_MONTH'), (-0.007376927107057215, 'Value_SP500_DIV_GROWTH_QUARTER'), (-0.002314480823033611, 'Value_SP500_EARNINGS_YIELD_YEAR'), (0.022568969230881045, 'Value_SP500_EARNINGS_YIELD_MONTH'), (0.025265445913567188, 'Value_SP500_PBV_RATIO_YEAR'), (0.12553339929260882, 'Value_SP500_PE_RATIO_MONTH'), (0.12877894460506153, 'Value_SP500_PBV_RATIO_QUARTER'), (0.1652084917363736, 'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER'), (0.1688368159634833, 'Value_SP500_REAL_SALES_YEAR'), (0.20153174127326504, 'Value_SHILLER_PE_RATIO_YEAR'), (0.21577231387568555, 'Value_SP500_EARNINGS_YEAR'), (0.2169762289480813, 'Value_SP500_DIV_YIELD_MONTH'), (0.22110502456487266, 'Value_SP500_REAL_SALES_GROWTH_YEAR')]
#print chart with top 5 features with maximum variance
Print_PCAfeatures_graph(df_interpolate,b,-6)
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_YEAR', 'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_EARNINGS_YEAR', 'Value_SP500_DIV_YIELD_MONTH', 'Value_SP500_REAL_SALES_GROWTH_YEAR']
Print_PCAfeatures_graph(df_interpolate,b,8)
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_DIV_GROWTH_YEAR', 'Value_SP500_DIV_YEAR', 'Value_SP500_REAL_SALES_GROWTH_QUARTER', 'Value_SP500_SALES_YEAR', 'Value_SP500_SALES_QUARTER', 'Value_SP500_SALES_GROWTH_QUARTER', 'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR']
Split 80% of dataset into train and 20% into test based on chronological order because its a timeseries. Avoid look-ahead bias by doing this.
def Create_Training_Test_Dataset(df,split_percent,Linear_regr):
'''
This function would split, slice and create training and test datasets. Provide 'Value_SP500_REAL_PRICE_MONTH' in the input
df. It would input to X
IN- df- a dataframe from which training and test dataset needs to slice.
split_percent - split percent for training and test dataset.
Linear_regr - A flag to split between dataset for linear regression object or LSTM network. Works for this project only.
OUT- X_train, Y_train, X_test, Y_test
'''
# Split the size into 80% and 20% based on rows.
train_size = int(len(df) * split_percent)
test_size = len(df) - train_size
#print(train_size,test_size)
print('Training and Test dataset is of size {} & {}'.format(train_size,test_size))
#Slice the df into train and test df.
train = df.iloc[0:train_size,:]
test = df.iloc[train_size:len(df),:]
#print(train.shape, test.shape)
#Check for Linear Regression flag from user input. If false it would return dataset for LSTM neural network.
if Linear_regr == 'False':
#Create Training dataset
temp_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
X_train = temp_train.iloc[0:train_size,:]
#X_train.head()
Y_train = train.iloc[0:train_size,:1]
#print(Y_train.head())
#print(X_train.shape,Y_train.shape)
#print(X_train.columns,Y_train.columns)
print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))
#Define Test dataset
temp_test = test.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
X_test = temp_test.iloc[0:test_size,:]
#X_train.head()
Y_test = test.iloc[0:test_size,:1]
#print(Y_train.head())
#print(X_test.shape,Y_test.shape)
#print(X_test.columns,Y_test.columns)
print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
else:
#Train dataset
X_train = train['Value_SP500_REAL_PRICE_MONTH'][0:train_size]
Y_train = train['Value_SP500_REAL_PRICE_MONTH'][0:test_size]
print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))
#Test dataset
X_test = test['Value_SP500_REAL_PRICE_MONTH'][0:train_size]
Y_test = test['Value_SP500_REAL_PRICE_MONTH'][0:test_size]
print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
return X_train, Y_train, X_test, Y_test
def Convert_dataset_nparray(X_train, Y_train, X_test, Y_test):
'''
This function would convert the training and test dataset to np.array.
In- X_train, Y_train, X_test, Y_test
OUT- np.array of X_train, Y_train, X_test, Y_test
'''
#Convert to np array as required for LSTM model.
X_train = np.array(X_train)
Y_train = np.array(Y_train)
#print(X_train.shape,Y_train.shape)
print('Training dataset is converted to np.array with size {} & {}'.format(X_train.shape,Y_train.shape))
#Convert test dataset to np.array.
X_test = np.array(X_test)
Y_test = np.array(Y_test)
#print(X_test.shape,Y_test.shape)
print('Test dataset is converted to np.array with size {} & {}'.format(X_test.shape,Y_test.shape))
return X_train, Y_train, X_test, Y_test
Multivariate time-series prediction
from subprocess import check_output
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import LSTM, CuDNNLSTM , BatchNormalization
import tensorflow as tf
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint
import time
from numpy import newaxis
Using TensorFlow backend.
# #Build Model
# model = Sequential()
# model.add(LSTM(input_dim=1,output_dim=50,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(LSTM(100,return_sequences=False))
# model.add(Dropout(0.2))
# model.add(Dense(output_dim=1))
# model.add(Activation('linear'))
# start = time.time()
# model.compile(loss='mse', optimizer='rmsprop')
# print ('compilation time : ', time.time() - start)
# #Build the model
# model = Sequential()
# model.add(LSTM(256,input_shape=(2837,34)))
# model.add(Dense(1))
# model.compile(optimizer='adam',loss='mse')
# #Reshape data for (Sample,Timestep,Features)
# #X_train = X_train.reshape((X_train.shape[0],X_train.shape[1],1))
# #X_test = X_test.reshape((X_test.shape[0],X_test.shape[1],1))
# #Fit model with history to check for overfitting
# #history = model.fit(X_train,y_train,epochs=300,validation_data=(X_test,y_test),shuffle=False)
#model.fit(X_train,Y_train,batch_size=128,epochs=10,validation_split=0.05)
# model = Sequential([
# Dense(32, input_shape=(2837,34)),
# Activation('relu'),
# Dense(10),
# Activation('softmax'),
# ])
# #Compile
# # For a mean squared error regression problem
# model.compile(optimizer='rmsprop',
# loss='mse')
df_interpolate.columns
Index(['Value_SP500_REAL_PRICE_MONTH', 'Value_SP500_DIV_YIELD_MONTH',
'Value_SP500_PE_RATIO_MONTH', 'Value_SHILLER_PE_RATIO_MONTH',
'Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_INFLADJ_MONTH',
'Value_SP500_PSR_QUARTER', 'Value_SP500_DIV_MONTH',
'Value_SP500_DIV_YEAR', 'Value_SP500_DIV_GROWTH_YEAR',
'Value_SP500_DIV_GROWTH_QUARTER', 'Value_SP500_PBV_RATIO_QUARTER',
'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_PE_RATIO_YEAR',
'Value_SP500_DIV_YIELD_YEAR', 'Value_SP500_PSR_YEAR',
'Value_SP500_EARNINGS_YIELD_YEAR', 'Value_SP500_PBV_RATIO_YEAR',
'Value_SP500_INFLADJ_YEAR', 'Value_SP500_SALES_YEAR',
'Value_SP500_SALES_GROWTH_YEAR', 'Value_SP500_SALES_QUARTER',
'Value_SP500_REAL_SALES_GROWTH_QUARTER',
'Value_SP500_SALES_GROWTH_QUARTER',
'Value_SP500_REAL_SALES_GROWTH_YEAR',
'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SP500_REAL_SALES_YEAR',
'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER',
'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_QUARTER',
'Value_SP500_EARNINGS_MONTH', 'Value_SP500_BVPS_YEAR',
'Value_SP500_EARNINGS_YEAR', 'Value_SP500_EARNINGS_GROWTH_YEAR',
'Value_SP500_BVPS_QUARTER'],
dtype='object')
#Not using PCA features here because the model resulted in huge mse errors. I think monthly features should be used which could
#predict the real price correctly.
df_PCA_features = df_interpolate.loc[:,['Value_SP500_REAL_PRICE_MONTH',
'Value_SP500_DIV_YIELD_MONTH',
'Value_SP500_PE_RATIO_MONTH',
'Value_SHILLER_PE_RATIO_MONTH',
'Value_SP500_EARNINGS_YIELD_MONTH',
'Value_SP500_INFLADJ_MONTH',
'Value_SP500_EARNINGS_MONTH','Value_SP500_PSR_QUARTER','Value_SP500_SALES_QUARTER',
'Value_SP500_REAL_SALES_GROWTH_QUARTER','Value_SP500_REAL_EARNINGS_GROWTH_QUARTER']]
df_PCA_features.shape
(3548, 11)
#apply scaling
scaler_df_imputed = StandardScaler()
scaled_features = scaler_df_imputed.fit_transform(df_PCA_features.values)
df_scaled_features_interpol = pd.DataFrame(scaled_features,index=df_PCA_features.index,columns = df_PCA_features.columns)
#df_scaled_features.describe()
#Use real data
#X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df=df_PCA_features, split_percent=0.95, Linear_regr='False')
#X_train, Y_train, X_test, Y_test = Convert_dataset_nparray(X_train, Y_train, X_test, Y_test)
#Split master dataframe into training and test datasets
X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df=df_PCA_features, split_percent=0.98, Linear_regr='False')
X_train, Y_train, X_test, Y_test = Convert_dataset_nparray(X_train, Y_train, X_test, Y_test)
Training and Test dataset is of size 3477 & 71 Features size of X_train and training target Y_train shape is (3477, 10) & (3477, 1) Features size of X_test and Test target Y_test shape is (71, 10) & (71, 1) Training dataset is converted to np.array with size (3477, 10) & (3477, 1) Test dataset is converted to np.array with size (71, 10) & (71, 1)
#Shape training data.
#the inputs (X) are reshaped into the 3D format expected by LSTM, namely [samples, timesteps, features]
X_train = X_train.reshape((X_train.shape[0],1,10))
#Y_train = Y_train.reshape((Y_train.shape[0],1,1)) #Do not reshape it.
print(X_train.shape,Y_train.shape)
print(X_train.shape[1],X_train.shape[2])
#Shape test dataset correctly for LSTM predict.
X_test = X_test.reshape((X_test.shape[0],1,10))
#Y_test = Y_test.reshape((Y_test.shape[0],Y_test.shape[1],1))
print(X_test.shape,Y_test.shape)
print('X_test input shape : ', X_test.shape[0],X_test.shape[1],X_test.shape[2])
(3477, 1, 10) (3477, 1) 1 10 (71, 1, 10) (71, 1) X_test input shape : 71 1 10
# #Design network for mean squared error regression problem
# model = Sequential()
# model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2])))
# model.add(Dropout(0.2))
# Activation('relu')
# model.add(Dense(1))
# Activation('softmax')
# # Compiling the model using mean square error loss, and Adam optimizer.
# model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
# #model.compile(loss='mse',optimizer='rmsprop',metrics=['accuracy'])
# max_features = 1024
# model = Sequential()
# model.add(Embedding(max_features, output_dim=256))
# model.add(LSTM(128))
# model.add(Dropout(0.5))
# model.add(Dense(1, activation='sigmoid'))
# model.compile(loss='binary_crossentropy',
# optimizer='rmsprop',
# metrics=['accuracy'])
#Design network for mean squared error regression problem
model = Sequential()
#The first dimension is supposed to be each sample.input should be (n_samples, timesteps, n_features)
model.add(LSTM(512, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(BatchNormalization())
# model.add(LSTM(512, input_shape=(X_train.shape[0],X_train.shape[2]),return_sequences=True))
# model.add(Dropout(0.1))
# model.add(BatchNormalization())
Activation('relu')
model.add(Dense(1))
Activation('softmax')
# Compiling the model using mean square error loss, and Adam optimizer.
model.compile(loss='mse', optimizer='adam',metrics=['accuracy'])
#model.compile(loss='mse',optimizer='rmsprop',metrics=['accuracy'])
filepath = "RNN_final-{epoch:02d}-{val_acc:.3f}"#unique file name which will include epochs and validation accuracy score.
checkpoint = ModelCheckpoint("models/{}.model".format(filepath,monitor = 'val_acc',verbose = 1))
print(model.summary)
<bound method Network.summary of <keras.engine.sequential.Sequential object at 0x0000023B40B28C88>>
# fit network with epochs
history = model.fit(X_train, Y_train, epochs=200, batch_size=100, validation_data=(X_train, Y_train), verbose=2,
callbacks = [checkpoint],
shuffle=False)
model.save('LSTM_model',overwrite=True,include_optimizer=True)
Train on 3477 samples, validate on 3477 samples Epoch 1/200 - 5s - loss: 244238.9797 - acc: 0.0000e+00 - val_loss: 242630.7920 - val_acc: 0.0000e+00 Epoch 2/200 - 2s - loss: 243637.9757 - acc: 0.0000e+00 - val_loss: 242623.0742 - val_acc: 0.0000e+00 Epoch 3/200 - 2s - loss: 242933.8914 - acc: 2.8760e-04 - val_loss: 242195.8168 - val_acc: 0.0000e+00 Epoch 4/200 - 2s - loss: 242000.9474 - acc: 2.8760e-04 - val_loss: 241721.6815 - val_acc: 0.0000e+00 Epoch 5/200 - 2s - loss: 240848.0547 - acc: 0.0000e+00 - val_loss: 240140.1107 - val_acc: 2.8760e-04 Epoch 6/200 - 2s - loss: 239475.0473 - acc: 0.0000e+00 - val_loss: 240018.5771 - val_acc: 0.0000e+00 Epoch 7/200 - 2s - loss: 237921.6260 - acc: 0.0000e+00 - val_loss: 239362.4484 - val_acc: 2.8760e-04 Epoch 8/200 - 2s - loss: 236189.1215 - acc: 2.8760e-04 - val_loss: 238873.2472 - val_acc: 0.0000e+00 Epoch 9/200 - 2s - loss: 234283.8113 - acc: 0.0000e+00 - val_loss: 236238.2314 - val_acc: 0.0000e+00 Epoch 10/200 - 2s - loss: 232263.7305 - acc: 0.0000e+00 - val_loss: 234679.4462 - val_acc: 2.8760e-04 Epoch 11/200 - 2s - loss: 230159.0793 - acc: 0.0000e+00 - val_loss: 231780.6211 - val_acc: 0.0000e+00 Epoch 12/200 - 2s - loss: 227963.8670 - acc: 0.0000e+00 - val_loss: 228800.3780 - val_acc: 0.0000e+00 Epoch 13/200 - 2s - loss: 225788.3079 - acc: 0.0000e+00 - val_loss: 226632.0599 - val_acc: 0.0000e+00 Epoch 14/200 - 3s - loss: 223606.1505 - acc: 0.0000e+00 - val_loss: 223809.9382 - val_acc: 0.0000e+00 Epoch 15/200 - 3s - loss: 221416.7447 - acc: 0.0000e+00 - val_loss: 220388.9384 - val_acc: 0.0000e+00 Epoch 16/200 - 3s - loss: 219306.1292 - acc: 0.0000e+00 - val_loss: 216452.5520 - val_acc: 0.0000e+00 Epoch 17/200 - 3s - loss: 217239.6921 - acc: 0.0000e+00 - val_loss: 213895.7212 - val_acc: 0.0000e+00 Epoch 18/200 - 3s - loss: 215266.2501 - acc: 0.0000e+00 - val_loss: 211874.5399 - val_acc: 0.0000e+00 Epoch 19/200 - 3s - loss: 213426.6784 - acc: 0.0000e+00 - val_loss: 209590.2366 - val_acc: 2.8760e-04 Epoch 20/200 - 3s - loss: 211654.5152 - acc: 0.0000e+00 - val_loss: 206945.6682 - val_acc: 0.0000e+00 Epoch 21/200 - 3s - loss: 210006.4334 - acc: 0.0000e+00 - val_loss: 204139.0336 - val_acc: 0.0000e+00 Epoch 22/200 - 2s - loss: 208480.0654 - acc: 0.0000e+00 - val_loss: 202818.2083 - val_acc: 0.0000e+00 Epoch 23/200 - 3s - loss: 207095.6864 - acc: 0.0000e+00 - val_loss: 200138.3704 - val_acc: 0.0000e+00 Epoch 24/200 - 3s - loss: 205760.5758 - acc: 2.8760e-04 - val_loss: 198417.4672 - val_acc: 0.0000e+00 Epoch 25/200 - 2s - loss: 204575.2013 - acc: 0.0000e+00 - val_loss: 196761.1546 - val_acc: 0.0000e+00 Epoch 26/200 - 2s - loss: 203502.4613 - acc: 0.0000e+00 - val_loss: 195262.1544 - val_acc: 0.0000e+00 Epoch 27/200 - 2s - loss: 202528.4124 - acc: 0.0000e+00 - val_loss: 194745.8543 - val_acc: 0.0000e+00 Epoch 28/200 - 3s - loss: 201706.8706 - acc: 0.0000e+00 - val_loss: 193888.8870 - val_acc: 0.0000e+00 Epoch 29/200 - 3s - loss: 200873.6037 - acc: 0.0000e+00 - val_loss: 193091.4885 - val_acc: 0.0000e+00 Epoch 30/200 - 3s - loss: 200161.1630 - acc: 0.0000e+00 - val_loss: 193206.9390 - val_acc: 0.0000e+00 Epoch 31/200 - 3s - loss: 199572.3630 - acc: 0.0000e+00 - val_loss: 192240.0993 - val_acc: 0.0000e+00 Epoch 32/200 - 3s - loss: 199035.0737 - acc: 0.0000e+00 - val_loss: 191247.9004 - val_acc: 0.0000e+00 Epoch 33/200 - 2s - loss: 198521.9556 - acc: 0.0000e+00 - val_loss: 190547.4664 - val_acc: 0.0000e+00 Epoch 34/200 - 2s - loss: 198099.8821 - acc: 0.0000e+00 - val_loss: 189714.8254 - val_acc: 0.0000e+00 Epoch 35/200 - 3s - loss: 197701.3073 - acc: 0.0000e+00 - val_loss: 188941.1377 - val_acc: 0.0000e+00 Epoch 36/200 - 3s - loss: 197376.9153 - acc: 0.0000e+00 - val_loss: 188377.7941 - val_acc: 2.8760e-04 Epoch 37/200 - 3s - loss: 197044.9920 - acc: 0.0000e+00 - val_loss: 187045.7611 - val_acc: 0.0000e+00 Epoch 38/200 - 2s - loss: 196795.9062 - acc: 0.0000e+00 - val_loss: 187351.3920 - val_acc: 0.0000e+00 Epoch 39/200 - 2s - loss: 196572.3703 - acc: 0.0000e+00 - val_loss: 187124.7167 - val_acc: 0.0000e+00 Epoch 40/200 - 3s - loss: 196370.9548 - acc: 0.0000e+00 - val_loss: 186639.7503 - val_acc: 0.0000e+00 Epoch 41/200 - 3s - loss: 196227.8874 - acc: 0.0000e+00 - val_loss: 186408.3390 - val_acc: 0.0000e+00 Epoch 42/200 - 2s - loss: 196061.2889 - acc: 0.0000e+00 - val_loss: 186077.8087 - val_acc: 0.0000e+00 Epoch 43/200 - 2s - loss: 195928.0746 - acc: 0.0000e+00 - val_loss: 185542.2910 - val_acc: 0.0000e+00 Epoch 44/200 - 3s - loss: 195864.5363 - acc: 0.0000e+00 - val_loss: 185105.1968 - val_acc: 0.0000e+00 Epoch 45/200 - 3s - loss: 195753.8908 - acc: 0.0000e+00 - val_loss: 184844.2340 - val_acc: 0.0000e+00 Epoch 46/200 - 3s - loss: 195698.9140 - acc: 0.0000e+00 - val_loss: 184650.5630 - val_acc: 0.0000e+00 Epoch 47/200 - 3s - loss: 195616.3337 - acc: 0.0000e+00 - val_loss: 184245.7706 - val_acc: 0.0000e+00 Epoch 48/200 - 3s - loss: 195569.5281 - acc: 0.0000e+00 - val_loss: 184300.0953 - val_acc: 0.0000e+00 Epoch 49/200 - 3s - loss: 195521.6131 - acc: 0.0000e+00 - val_loss: 184786.1297 - val_acc: 0.0000e+00 Epoch 50/200 - 3s - loss: 195468.6009 - acc: 0.0000e+00 - val_loss: 185139.8302 - val_acc: 0.0000e+00 Epoch 51/200 - 3s - loss: 195454.3358 - acc: 0.0000e+00 - val_loss: 185025.3426 - val_acc: 0.0000e+00 Epoch 52/200 - 3s - loss: 195445.4346 - acc: 0.0000e+00 - val_loss: 184578.5638 - val_acc: 0.0000e+00 Epoch 53/200 - 3s - loss: 195396.9004 - acc: 0.0000e+00 - val_loss: 184158.2218 - val_acc: 0.0000e+00 Epoch 54/200 - 3s - loss: 195407.0587 - acc: 0.0000e+00 - val_loss: 184341.6772 - val_acc: 0.0000e+00 Epoch 55/200 - 3s - loss: 195399.8430 - acc: 0.0000e+00 - val_loss: 184529.7913 - val_acc: 0.0000e+00 Epoch 56/200 - 3s - loss: 195348.8731 - acc: 0.0000e+00 - val_loss: 184536.9332 - val_acc: 0.0000e+00 Epoch 57/200 - 3s - loss: 195358.3157 - acc: 0.0000e+00 - val_loss: 184368.8823 - val_acc: 0.0000e+00 Epoch 58/200 - 3s - loss: 195324.2318 - acc: 0.0000e+00 - val_loss: 183566.8680 - val_acc: 0.0000e+00 Epoch 59/200 - 3s - loss: 195324.9126 - acc: 0.0000e+00 - val_loss: 183373.8943 - val_acc: 0.0000e+00 Epoch 60/200 - 3s - loss: 195321.1821 - acc: 0.0000e+00 - val_loss: 183972.8905 - val_acc: 0.0000e+00 Epoch 61/200 - 3s - loss: 195354.8061 - acc: 0.0000e+00 - val_loss: 184067.9746 - val_acc: 0.0000e+00 Epoch 62/200 - 3s - loss: 195325.6220 - acc: 0.0000e+00 - val_loss: 183365.7582 - val_acc: 0.0000e+00 Epoch 63/200 - 2s - loss: 195325.1537 - acc: 0.0000e+00 - val_loss: 183448.1593 - val_acc: 0.0000e+00 Epoch 64/200 - 2s - loss: 195361.3605 - acc: 0.0000e+00 - val_loss: 184158.1261 - val_acc: 0.0000e+00 Epoch 65/200 - 2s - loss: 195327.4322 - acc: 0.0000e+00 - val_loss: 184253.0312 - val_acc: 0.0000e+00 Epoch 66/200 - 2s - loss: 195378.1483 - acc: 0.0000e+00 - val_loss: 184558.3813 - val_acc: 0.0000e+00 Epoch 67/200 - 2s - loss: 195374.7326 - acc: 0.0000e+00 - val_loss: 183383.6095 - val_acc: 0.0000e+00 Epoch 68/200 - 2s - loss: 195368.7838 - acc: 0.0000e+00 - val_loss: 183544.9586 - val_acc: 0.0000e+00 Epoch 69/200 - 3s - loss: 195334.8143 - acc: 0.0000e+00 - val_loss: 184147.1688 - val_acc: 0.0000e+00 Epoch 70/200 - 3s - loss: 195389.6477 - acc: 0.0000e+00 - val_loss: 184665.8255 - val_acc: 0.0000e+00 Epoch 71/200 - 2s - loss: 195363.5043 - acc: 0.0000e+00 - val_loss: 184763.5030 - val_acc: 0.0000e+00 Epoch 72/200 - 2s - loss: 195373.3171 - acc: 0.0000e+00 - val_loss: 184667.6454 - val_acc: 0.0000e+00 Epoch 73/200 - 2s - loss: 195379.7082 - acc: 0.0000e+00 - val_loss: 184943.3379 - val_acc: 0.0000e+00 Epoch 74/200 - 2s - loss: 195392.4984 - acc: 0.0000e+00 - val_loss: 184610.1339 - val_acc: 0.0000e+00 Epoch 75/200 - 2s - loss: 195357.4133 - acc: 0.0000e+00 - val_loss: 184848.9067 - val_acc: 0.0000e+00 Epoch 76/200 - 2s - loss: 195429.5762 - acc: 0.0000e+00 - val_loss: 185251.1268 - val_acc: 0.0000e+00 Epoch 77/200 - 2s - loss: 195427.6759 - acc: 0.0000e+00 - val_loss: 185617.7255 - val_acc: 0.0000e+00 Epoch 78/200 - 2s - loss: 195412.7640 - acc: 0.0000e+00 - val_loss: 185401.8593 - val_acc: 0.0000e+00 Epoch 79/200 - 3s - loss: 195408.7144 - acc: 0.0000e+00 - val_loss: 185220.5007 - val_acc: 0.0000e+00 Epoch 80/200 - 3s - loss: 195395.4735 - acc: 0.0000e+00 - val_loss: 185333.0465 - val_acc: 0.0000e+00 Epoch 81/200 - 3s - loss: 195431.5948 - acc: 0.0000e+00 - val_loss: 185281.1637 - val_acc: 0.0000e+00 Epoch 82/200 - 3s - loss: 195427.2975 - acc: 0.0000e+00 - val_loss: 185063.8881 - val_acc: 0.0000e+00 Epoch 83/200 - 2s - loss: 195429.2891 - acc: 0.0000e+00 - val_loss: 185142.5852 - val_acc: 0.0000e+00 Epoch 84/200 - 2s - loss: 195459.0527 - acc: 0.0000e+00 - val_loss: 184590.9984 - val_acc: 0.0000e+00 Epoch 85/200 - 2s - loss: 195448.4422 - acc: 0.0000e+00 - val_loss: 184182.4285 - val_acc: 0.0000e+00 Epoch 86/200 - 2s - loss: 195424.7728 - acc: 0.0000e+00 - val_loss: 184104.9516 - val_acc: 0.0000e+00 Epoch 87/200 - 2s - loss: 195400.6353 - acc: 0.0000e+00 - val_loss: 183871.4632 - val_acc: 0.0000e+00 Epoch 88/200 - 3s - loss: 195432.5526 - acc: 0.0000e+00 - val_loss: 183814.1315 - val_acc: 0.0000e+00 Epoch 89/200 - 3s - loss: 195453.0358 - acc: 2.8760e-04 - val_loss: 184666.0870 - val_acc: 0.0000e+00 Epoch 90/200 - 2s - loss: 195464.1884 - acc: 0.0000e+00 - val_loss: 184499.2254 - val_acc: 0.0000e+00 Epoch 91/200 - 2s - loss: 195448.0775 - acc: 0.0000e+00 - val_loss: 184666.1427 - val_acc: 0.0000e+00 Epoch 92/200 - 2s - loss: 195447.6317 - acc: 0.0000e+00 - val_loss: 185652.8650 - val_acc: 0.0000e+00 Epoch 93/200 - 3s - loss: 195436.8737 - acc: 0.0000e+00 - val_loss: 185552.3937 - val_acc: 0.0000e+00 Epoch 94/200 - 3s - loss: 195437.4609 - acc: 0.0000e+00 - val_loss: 185486.7145 - val_acc: 0.0000e+00 Epoch 95/200 - 2s - loss: 195483.3410 - acc: 0.0000e+00 - val_loss: 185325.4870 - val_acc: 0.0000e+00 Epoch 96/200 - 2s - loss: 195458.6826 - acc: 0.0000e+00 - val_loss: 184841.7381 - val_acc: 0.0000e+00 Epoch 97/200 - 2s - loss: 195438.1483 - acc: 0.0000e+00 - val_loss: 184822.1032 - val_acc: 0.0000e+00 Epoch 98/200 - 3s - loss: 195474.9970 - acc: 0.0000e+00 - val_loss: 184452.8845 - val_acc: 0.0000e+00 Epoch 99/200 - 3s - loss: 195479.1822 - acc: 0.0000e+00 - val_loss: 184494.5250 - val_acc: 0.0000e+00 Epoch 100/200 - 2s - loss: 195469.4960 - acc: 0.0000e+00 - val_loss: 184210.1910 - val_acc: 0.0000e+00 Epoch 101/200 - 2s - loss: 195463.7877 - acc: 0.0000e+00 - val_loss: 184259.6424 - val_acc: 0.0000e+00 Epoch 102/200 - 2s - loss: 195489.6614 - acc: 0.0000e+00 - val_loss: 184673.6306 - val_acc: 0.0000e+00 Epoch 103/200 - 2s - loss: 195468.4255 - acc: 0.0000e+00 - val_loss: 184837.2438 - val_acc: 0.0000e+00 Epoch 104/200 - 2s - loss: 195466.5730 - acc: 0.0000e+00 - val_loss: 184831.9443 - val_acc: 0.0000e+00 Epoch 105/200 - 2s - loss: 195461.7147 - acc: 0.0000e+00 - val_loss: 184811.8820 - val_acc: 0.0000e+00 Epoch 106/200 - 2s - loss: 195495.6439 - acc: 0.0000e+00 - val_loss: 184535.9261 - val_acc: 0.0000e+00 Epoch 107/200 - 3s - loss: 195478.5014 - acc: 0.0000e+00 - val_loss: 183850.6027 - val_acc: 0.0000e+00 Epoch 108/200 - 3s - loss: 195485.0055 - acc: 0.0000e+00 - val_loss: 184228.0488 - val_acc: 0.0000e+00 Epoch 109/200 - 3s - loss: 195490.1562 - acc: 0.0000e+00 - val_loss: 184049.4651 - val_acc: 0.0000e+00 Epoch 110/200 - 2s - loss: 195463.5472 - acc: 0.0000e+00 - val_loss: 183979.9637 - val_acc: 0.0000e+00 Epoch 111/200 - 3s - loss: 195466.2002 - acc: 0.0000e+00 - val_loss: 183922.0163 - val_acc: 0.0000e+00 Epoch 112/200 - 2s - loss: 195494.2449 - acc: 0.0000e+00 - val_loss: 184098.9795 - val_acc: 0.0000e+00 Epoch 113/200 - 2s - loss: 195467.4129 - acc: 2.8760e-04 - val_loss: 183923.9146 - val_acc: 0.0000e+00 Epoch 114/200 - 3s - loss: 195478.9303 - acc: 0.0000e+00 - val_loss: 184233.8636 - val_acc: 0.0000e+00 Epoch 115/200 - 2s - loss: 195492.8755 - acc: 0.0000e+00 - val_loss: 183678.9782 - val_acc: 0.0000e+00 Epoch 116/200 - 3s - loss: 195476.1045 - acc: 0.0000e+00 - val_loss: 183999.5662 - val_acc: 0.0000e+00 Epoch 117/200 - 3s - loss: 195475.0067 - acc: 0.0000e+00 - val_loss: 183750.6926 - val_acc: 0.0000e+00 Epoch 118/200 - 2s - loss: 195444.1771 - acc: 0.0000e+00 - val_loss: 183531.4881 - val_acc: 0.0000e+00 Epoch 119/200 - 3s - loss: 195456.0990 - acc: 0.0000e+00 - val_loss: 183617.5793 - val_acc: 0.0000e+00 Epoch 120/200 - 2s - loss: 195466.2133 - acc: 0.0000e+00 - val_loss: 183802.5115 - val_acc: 0.0000e+00 Epoch 121/200 - 3s - loss: 195437.4059 - acc: 0.0000e+00 - val_loss: 183594.7202 - val_acc: 0.0000e+00 Epoch 122/200 - 3s - loss: 195479.0137 - acc: 0.0000e+00 - val_loss: 183594.9440 - val_acc: 0.0000e+00 Epoch 123/200 - 2s - loss: 195445.1567 - acc: 0.0000e+00 - val_loss: 183830.1620 - val_acc: 0.0000e+00 Epoch 124/200 - 2s - loss: 195485.2744 - acc: 0.0000e+00 - val_loss: 183234.6731 - val_acc: 0.0000e+00 Epoch 125/200 - 2s - loss: 195475.8863 - acc: 0.0000e+00 - val_loss: 183765.6606 - val_acc: 0.0000e+00 Epoch 126/200 - 2s - loss: 195461.2404 - acc: 2.8760e-04 - val_loss: 183565.6058 - val_acc: 0.0000e+00 Epoch 127/200 - 2s - loss: 195482.5538 - acc: 0.0000e+00 - val_loss: 183641.1930 - val_acc: 0.0000e+00 Epoch 128/200 - 2s - loss: 195472.1918 - acc: 0.0000e+00 - val_loss: 183727.4728 - val_acc: 0.0000e+00 Epoch 129/200 - 2s - loss: 195476.9520 - acc: 0.0000e+00 - val_loss: 183403.4754 - val_acc: 0.0000e+00 Epoch 130/200 - 2s - loss: 195470.1123 - acc: 0.0000e+00 - val_loss: 183650.5799 - val_acc: 0.0000e+00 Epoch 131/200 - 2s - loss: 195467.0799 - acc: 0.0000e+00 - val_loss: 183422.5880 - val_acc: 0.0000e+00 Epoch 132/200 - 2s - loss: 195477.9288 - acc: 0.0000e+00 - val_loss: 183731.2103 - val_acc: 0.0000e+00 Epoch 133/200 - 2s - loss: 195453.9476 - acc: 0.0000e+00 - val_loss: 183664.7018 - val_acc: 0.0000e+00 Epoch 134/200 - 2s - loss: 195459.7911 - acc: 2.8760e-04 - val_loss: 183258.9522 - val_acc: 0.0000e+00 Epoch 135/200 - 3s - loss: 195477.6894 - acc: 0.0000e+00 - val_loss: 183250.0353 - val_acc: 0.0000e+00 Epoch 136/200 - 2s - loss: 195470.4907 - acc: 2.8760e-04 - val_loss: 183325.1750 - val_acc: 0.0000e+00 Epoch 137/200 - 2s - loss: 195457.6549 - acc: 0.0000e+00 - val_loss: 183128.0413 - val_acc: 0.0000e+00 Epoch 138/200 - 2s - loss: 195451.7374 - acc: 2.8760e-04 - val_loss: 183095.3022 - val_acc: 0.0000e+00 Epoch 139/200 - 2s - loss: 195430.9589 - acc: 0.0000e+00 - val_loss: 183068.1891 - val_acc: 0.0000e+00 Epoch 140/200 - 2s - loss: 195472.4346 - acc: 0.0000e+00 - val_loss: 183043.8016 - val_acc: 0.0000e+00 Epoch 141/200 - 3s - loss: 195474.6245 - acc: 0.0000e+00 - val_loss: 183072.5931 - val_acc: 0.0000e+00 Epoch 142/200 - 2s - loss: 195439.9469 - acc: 0.0000e+00 - val_loss: 183135.6248 - val_acc: 0.0000e+00 Epoch 143/200 - 2s - loss: 195441.8985 - acc: 0.0000e+00 - val_loss: 183021.7210 - val_acc: 0.0000e+00 Epoch 144/200 - 2s - loss: 195442.6170 - acc: 0.0000e+00 - val_loss: 182893.0097 - val_acc: 0.0000e+00 Epoch 145/200 - 2s - loss: 195439.8753 - acc: 0.0000e+00 - val_loss: 182915.6292 - val_acc: 0.0000e+00 Epoch 146/200 - 2s - loss: 195436.0277 - acc: 0.0000e+00 - val_loss: 182777.1501 - val_acc: 0.0000e+00 Epoch 147/200 - 2s - loss: 195434.7522 - acc: 0.0000e+00 - val_loss: 182753.7661 - val_acc: 0.0000e+00 Epoch 148/200 - 2s - loss: 195457.6565 - acc: 0.0000e+00 - val_loss: 182612.4348 - val_acc: 0.0000e+00 Epoch 149/200 - 2s - loss: 195495.7157 - acc: 0.0000e+00 - val_loss: 182550.5300 - val_acc: 0.0000e+00 Epoch 150/200 - 2s - loss: 195449.4474 - acc: 0.0000e+00 - val_loss: 182546.5075 - val_acc: 0.0000e+00 Epoch 151/200 - 2s - loss: 195457.4294 - acc: 0.0000e+00 - val_loss: 181868.5405 - val_acc: 0.0000e+00 Epoch 152/200 - 2s - loss: 195460.9698 - acc: 2.8760e-04 - val_loss: 182062.7935 - val_acc: 0.0000e+00 Epoch 153/200 - 2s - loss: 195457.5152 - acc: 0.0000e+00 - val_loss: 181863.8373 - val_acc: 0.0000e+00 Epoch 154/200 - 2s - loss: 195466.1971 - acc: 0.0000e+00 - val_loss: 182173.9323 - val_acc: 0.0000e+00 Epoch 155/200 - 2s - loss: 195444.8498 - acc: 0.0000e+00 - val_loss: 181869.6481 - val_acc: 0.0000e+00 Epoch 156/200 - 2s - loss: 195453.1903 - acc: 0.0000e+00 - val_loss: 181605.8907 - val_acc: 0.0000e+00 Epoch 157/200 - 2s - loss: 195489.6974 - acc: 0.0000e+00 - val_loss: 182259.6924 - val_acc: 0.0000e+00 Epoch 158/200 - 2s - loss: 195466.0687 - acc: 0.0000e+00 - val_loss: 182189.6657 - val_acc: 0.0000e+00 Epoch 159/200 - 2s - loss: 195485.9728 - acc: 0.0000e+00 - val_loss: 182403.9244 - val_acc: 0.0000e+00 Epoch 160/200 - 3s - loss: 195506.8130 - acc: 0.0000e+00 - val_loss: 182244.1800 - val_acc: 0.0000e+00 Epoch 161/200 - 3s - loss: 195480.0568 - acc: 0.0000e+00 - val_loss: 182473.5691 - val_acc: 0.0000e+00 Epoch 162/200 - 2s - loss: 195496.4917 - acc: 0.0000e+00 - val_loss: 182636.2321 - val_acc: 0.0000e+00 Epoch 163/200 - 2s - loss: 195471.3256 - acc: 0.0000e+00 - val_loss: 182680.6537 - val_acc: 2.8760e-04 Epoch 164/200 - 2s - loss: 195473.1113 - acc: 0.0000e+00 - val_loss: 182696.0740 - val_acc: 0.0000e+00 Epoch 165/200 - 2s - loss: 195477.8964 - acc: 0.0000e+00 - val_loss: 182575.0914 - val_acc: 2.8760e-04 Epoch 166/200 - 2s - loss: 195487.1710 - acc: 0.0000e+00 - val_loss: 182742.1346 - val_acc: 0.0000e+00 Epoch 167/200 - 2s - loss: 195460.1076 - acc: 0.0000e+00 - val_loss: 182596.5257 - val_acc: 0.0000e+00 Epoch 168/200 - 3s - loss: 195465.9434 - acc: 0.0000e+00 - val_loss: 182656.0113 - val_acc: 0.0000e+00 Epoch 169/200 - 2s - loss: 195463.6720 - acc: 2.8760e-04 - val_loss: 181938.2320 - val_acc: 0.0000e+00 Epoch 170/200 - 2s - loss: 195480.6142 - acc: 0.0000e+00 - val_loss: 181441.6512 - val_acc: 0.0000e+00 Epoch 171/200 - 2s - loss: 195475.3307 - acc: 0.0000e+00 - val_loss: 181792.5836 - val_acc: 0.0000e+00 Epoch 172/200 - 2s - loss: 195456.1476 - acc: 0.0000e+00 - val_loss: 182147.3755 - val_acc: 0.0000e+00 Epoch 173/200 - 2s - loss: 195475.8040 - acc: 0.0000e+00 - val_loss: 182177.5330 - val_acc: 0.0000e+00 Epoch 174/200 - 2s - loss: 195479.9504 - acc: 0.0000e+00 - val_loss: 182208.6537 - val_acc: 0.0000e+00 Epoch 175/200 - 2s - loss: 195459.6837 - acc: 0.0000e+00 - val_loss: 182122.0784 - val_acc: 0.0000e+00 Epoch 176/200 - 2s - loss: 195467.3283 - acc: 2.8760e-04 - val_loss: 181848.8619 - val_acc: 0.0000e+00 Epoch 177/200 - 2s - loss: 195468.4745 - acc: 0.0000e+00 - val_loss: 181862.0922 - val_acc: 0.0000e+00 Epoch 178/200 - 2s - loss: 195471.8972 - acc: 0.0000e+00 - val_loss: 181886.4019 - val_acc: 0.0000e+00 Epoch 179/200 - 2s - loss: 195460.1161 - acc: 0.0000e+00 - val_loss: 181814.8222 - val_acc: 0.0000e+00 Epoch 180/200 - 2s - loss: 195499.5465 - acc: 0.0000e+00 - val_loss: 181819.5692 - val_acc: 0.0000e+00 Epoch 181/200 - 2s - loss: 195454.7540 - acc: 0.0000e+00 - val_loss: 181887.8555 - val_acc: 0.0000e+00 Epoch 182/200 - 2s - loss: 195487.3822 - acc: 0.0000e+00 - val_loss: 181821.7993 - val_acc: 0.0000e+00 Epoch 183/200 - 2s - loss: 195455.0825 - acc: 0.0000e+00 - val_loss: 181856.4182 - val_acc: 0.0000e+00 Epoch 184/200 - 2s - loss: 195496.6770 - acc: 0.0000e+00 - val_loss: 181768.0757 - val_acc: 0.0000e+00 Epoch 185/200 - 2s - loss: 195461.5927 - acc: 0.0000e+00 - val_loss: 181683.1072 - val_acc: 0.0000e+00 Epoch 186/200 - 2s - loss: 195495.0217 - acc: 0.0000e+00 - val_loss: 181700.0158 - val_acc: 0.0000e+00 Epoch 187/200 - 2s - loss: 195445.7431 - acc: 0.0000e+00 - val_loss: 181766.8667 - val_acc: 0.0000e+00 Epoch 188/200 - 2s - loss: 195463.6909 - acc: 0.0000e+00 - val_loss: 181982.3083 - val_acc: 0.0000e+00 Epoch 189/200 - 2s - loss: 195477.2432 - acc: 0.0000e+00 - val_loss: 181940.3596 - val_acc: 0.0000e+00 Epoch 190/200 - 2s - loss: 195464.1343 - acc: 0.0000e+00 - val_loss: 181823.8806 - val_acc: 0.0000e+00 Epoch 191/200 - 2s - loss: 195452.2148 - acc: 0.0000e+00 - val_loss: 181949.4636 - val_acc: 0.0000e+00 Epoch 192/200 - 2s - loss: 195469.9830 - acc: 0.0000e+00 - val_loss: 181792.7081 - val_acc: 0.0000e+00 Epoch 193/200 - 2s - loss: 195459.5354 - acc: 0.0000e+00 - val_loss: 181454.7131 - val_acc: 0.0000e+00 Epoch 194/200 - 3s - loss: 195448.5514 - acc: 0.0000e+00 - val_loss: 181652.3696 - val_acc: 0.0000e+00 Epoch 195/200 - 2s - loss: 195465.3238 - acc: 0.0000e+00 - val_loss: 181647.2812 - val_acc: 0.0000e+00 Epoch 196/200 - 2s - loss: 195464.8206 - acc: 0.0000e+00 - val_loss: 181621.8350 - val_acc: 0.0000e+00 Epoch 197/200 - 2s - loss: 195439.5109 - acc: 0.0000e+00 - val_loss: 181538.7109 - val_acc: 0.0000e+00 Epoch 198/200 - 2s - loss: 195468.4378 - acc: 0.0000e+00 - val_loss: 181387.1260 - val_acc: 0.0000e+00 Epoch 199/200 - 2s - loss: 195460.6704 - acc: 0.0000e+00 - val_loss: 181443.1030 - val_acc: 0.0000e+00 Epoch 200/200 - 2s - loss: 195466.0657 - acc: 0.0000e+00 - val_loss: 181374.3545 - val_acc: 0.0000e+00
# plot history
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='test val_loss')
plt.legend()
plt.show()
y_pred_lstm_imputed = model.predict(X_test)
plt.plot(Y_test.reshape(-1,1))
plt.plot(y_pred_lstm_imputed)
[<matplotlib.lines.Line2D at 0x23b452ded30>]
plt.plot(Y_test.reshape(-1,1))
plt.plot(y_pred_lstm_imputed)
plt.title('LSTM predicted vs actual values')
#plt.xlabel('Actual Price')
#plt.ylabel('Predicted Price')
#plt.legend(Y_test,y_pred_lstm_imputed)
plt.show()
print(Y_test.shape,X_test.shape,y_pred_lstm_imputed.shape)
(71, 1) (71, 1, 10) (71, 1)
Check and updated shape of X_test. Scale X_train and x_test on correct shape of dataframe.
#Inverse transform the prediction and Y_test
X_test = X_test.reshape((X_test.shape[0],X_test.shape[2]))
# Concatenate with X_test to reshape before inverse transform
y_pred_lstm_imputed = concatenate((y_pred_lstm_imputed,X_test),axis=1)
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)
#Invert scaling for forecast
y_pred_lstm_imputed = scaler_df_imputed.inverse_transform(y_pred_lstm_imputed)
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)
#slice reverse transformed y_pred values to compare with Y_test actual values
y_pred_lstm_imputed = y_pred_lstm_imputed[:,0:1]
(71, 10) (71, 11) (71, 1) (71, 10) (71, 11) (71, 1)
#Reconsturct Y_test with X_test and inverse transform it.
Y_test = concatenate((Y_test,X_test),axis=1)
Y_test = scaler_df_imputed.inverse_transform(Y_test)
#slice reverse transformed y_pred values to compare with Y_test actual values
Y_test = Y_test[:,0:1]
print(X_test.shape,y_pred_lstm_imputed.shape,Y_test.shape)
(71, 10) (71, 1) (71, 1)
#Calculate MSE between actual values Y_test and inverse transformed y_pred
mse = mean_squared_error(Y_test,y_pred_lstm_imputed)
print('Means Square Error between Y_test adn prediction values is :', mse)
Means Square Error between Y_test adn prediction values is : 1307257985747.0803
print('predicted- {} ; Actual - {}'.format(y_pred_lstm_imputed[-1], Y_test[-1]))
predicted- [143851.07772796] ; Actual - [1460579.96104185]
r2score = r2_score(Y_test,y_pred_lstm_imputed)
print('R2 score between Y_test adn prediction value is :', r2score)
R2 score between Y_test adn prediction value is : -60.31341629150638
model.summary()
_________________________________________________________________ Layer (type) Output Shape Param # ================================================================= lstm_1 (LSTM) (None, 512) 1071104 _________________________________________________________________ dropout_1 (Dropout) (None, 512) 0 _________________________________________________________________ batch_normalization_1 (Batch (None, 512) 2048 _________________________________________________________________ dense_1 (Dense) (None, 1) 513 ================================================================= Total params: 1,073,665 Trainable params: 1,072,641 Non-trainable params: 1,024 _________________________________________________________________
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
#Convert Date index to column
#df_imputed.reset_index(inplace=True)
Best observed high variance features Vs target label 'Value_SP500_REAL_PRICE_MONTH' to understand linear corelation.
This is to find colinearity between features and the prediction line for selected set of features. Predicted line would try to fit with linear regression model through the feature data points.
The dataset here is very imbalanced and non linear which makes it different to predict a linear line with optimum coefficients and intercept.
#5 features with highest variance.
max_var_cols1 = Print_PCAfeatures_graph(df_imputed,a,-10)
# visualize relationship between the features and the target price using scatterplots
sns.pairplot(df_imputed, x_vars=max_var_cols1,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
<seaborn.axisgrid.PairGrid at 0x23b41f41390>
#5 feature with highest PCA variance.
max_var_cols2 = Print_PCAfeatures_graph(df_interpolate,b,-10)
print(max_var_cols2)
# visualize relationship between the features and the target price using seaborn scatterplot.
#High variance features Vs target label
sns.pairplot(df_interpolate, x_vars=max_var_cols2,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_EARNINGS_YIELD_MONTH', 'Value_SP500_PBV_RATIO_YEAR', 'Value_SP500_PE_RATIO_MONTH', 'Value_SP500_PBV_RATIO_QUARTER', 'Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_SALES_YEAR', 'Value_SHILLER_PE_RATIO_YEAR', 'Value_SP500_EARNINGS_YEAR', 'Value_SP500_DIV_YIELD_MONTH', 'Value_SP500_REAL_SALES_GROWTH_YEAR']
<seaborn.axisgrid.PairGrid at 0x23b48418c50>
#5 feature with highest PCA variance.
max_var_cols3 = Print_PCAfeatures_graph(df_interpolate,b,10)
print(max_var_cols3)
# visualize relationship between the features and the target price using seaborn scatterplot.
#High variance features Vs target label
sns.pairplot(df_interpolate, x_vars=max_var_cols3,y_vars=['Value_SP500_REAL_PRICE_MONTH'], height=10, aspect=0.5,kind='reg')
<Figure size 3000x2000 with 0 Axes>
['Value_SP500_DIV_GROWTH_YEAR', 'Value_SP500_DIV_YEAR', 'Value_SP500_REAL_SALES_GROWTH_QUARTER', 'Value_SP500_SALES_YEAR', 'Value_SP500_SALES_QUARTER', 'Value_SP500_SALES_GROWTH_QUARTER', 'Value_SP500_EARNINGS_GROWTH_QUARTER', 'Value_SP500_REAL_EARNINGS_GROWTH_YEAR', 'Value_SHILLER_PE_RATIO_MONTH', 'Value_SP500_PSR_YEAR']
<seaborn.axisgrid.PairGrid at 0x23b4d9d5780>
#Split data into train and test
#X_train, Y_train, X_test, Y_test = Create_Training_Test_Dataset(df_imputed,0.8,Linear_regr='True')
scaler = StandardScaler()
scaled_interpolated = scaler.fit_transform(df_PCA_features.values)
df_scaled_interpolated = pd.DataFrame(scaled_interpolated,index=df_PCA_features.index,columns = df_PCA_features.columns)
# Split the size into 80% and 20% row-wise
split_percent = 0.8
train_size = int(len(df_PCA_features) * split_percent)
test_size = len(df_PCA_features) - train_size
#print(train_size,test_size)
print('Training and Test dataset is of size {} & {}'.format(train_size,test_size))
#Slice the df into train and test df.
train = df_PCA_features.iloc[0:train_size,:]
test = df_PCA_features.iloc[train_size:len(df),:]
print(train.shape, test.shape)
Training and Test dataset is of size 2838 & 710 (2838, 11) (710, 11)
# #Slice the df into train and test df.
# train = df_scaled_interpolated.iloc[0:train_size,:]
# test = df_scaled_interpolated.iloc[train_size:len(df),:]
# print(train.shape, test.shape)
# Slice Train dataset.
temp_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
X_train = train.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
Y_train = train['Value_SP500_REAL_PRICE_MONTH']
print('Features size of X_train and training target Y_train shape is {} & {}'.format(X_train.shape,Y_train.shape))
#Test dataset
X_test = test.drop(['Value_SP500_REAL_PRICE_MONTH'],axis=1,inplace=False)
Y_test = test['Value_SP500_REAL_PRICE_MONTH']
print('Features size of X_test and Test target Y_test shape is {} & {}'.format(X_test.shape,Y_test.shape))
Features size of X_train and training target Y_train shape is (2838, 10) & (2838,) Features size of X_test and Test target Y_test shape is (710, 10) & (710,)
#np.array(temp_train[:]).reshape(-1,1)
#print((np.array(temp_train[:]).reshape(-1,2)).shape)
# Instantiate linear regression object
regr = linear_model.LinearRegression()
# Train the model using the training sets
regr.fit(X_train, Y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)
# print the intercept and coefficients
print(regr.intercept_)
print(regr.coef_)
137.11463021015638 [-3.89875853e+00 -5.67269650e-01 -5.46924799e+00 -6.41997252e+00 1.29380410e-01 2.46530686e+00 0.00000000e+00 -3.98730436e-02 1.55754077e-04 1.55754077e-04]
#Linear regression prediction on testing set
y_pred_inter_regr = regr.predict(X_test)
print(y_pred_inter_regr.shape,X_test.shape,Y_test.shape)
(710,) (710, 10) (710,)
feature_cols = X_train.columns
# pair the feature names with the coefficients
print('Coefficients of all feature variables in training dataset are: ')
list(zip(feature_cols, regr.coef_))
Coefficients of all feature variables in training dataset are:
[('Value_SP500_DIV_YIELD_MONTH', -3.8987585275669585),
('Value_SP500_PE_RATIO_MONTH', -0.5672696500428746),
('Value_SHILLER_PE_RATIO_MONTH', -5.469247991877379),
('Value_SP500_EARNINGS_YIELD_MONTH', -6.41997252365831),
('Value_SP500_INFLADJ_MONTH', 0.12938040986247704),
('Value_SP500_EARNINGS_MONTH', 2.465306856497017),
('Value_SP500_PSR_QUARTER', 0.0),
('Value_SP500_SALES_QUARTER', -0.03987304360362631),
('Value_SP500_REAL_SALES_GROWTH_QUARTER', 0.00015575407657666528),
('Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 0.00015575407657666528)]
# The coefficients
#print('Coefficients: \n', regr.coef_)
# The mean squared error: is always non-negative, and values closer to zero are better.
print('Mean squared error: {}'.format(mean_squared_error(Y_test, y_pred_inter_regr)))
# The Root mean squared error: RMSE is popular than MSE, because RMSE is interpretable in the "y" units.
print('Root Mean squared error: {}'.format(np.sqrt(mean_squared_error(Y_test, y_pred_inter_regr))))
# Explained variance score: 1 is perfect prediction
print('Variance R2 score: {}'.format(r2_score(Y_test, y_pred_inter_regr)))
print('intercept: ',regr.intercept_)
print('Score : ',regr.score)
Mean squared error: 1076172.6990871658
Root Mean squared error: 1037.3874392372243
Variance R2 score: -1.9382920775190278
intercept: 137.11463021015638
Score : <bound method RegressorMixin.score of LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False)>
#Reshape prediction and test set to concatenate with X_test to get origninal form of test dataset.
y_pred_inter_regr = y_pred_inter_regr[:,np.newaxis]
Y_test = Y_test[:,np.newaxis]
print(X_train.shape,X_test.shape,y_pred_inter_regr.shape,Y_test.shape)
y_pred_inter_regr = concatenate((y_pred_inter_regr, X_test), axis=1)
print(X_test.shape,y_pred_inter_regr.shape,Y_test.shape)
(2838, 10) (710, 10) (710, 1) (710, 1) (710, 10) (710, 11) (710, 1)
# # Plot outputs
# for col in feature_cols:
# plt.scatter(X_test[col], Y_test)
# #plt.plot(X_test[col], y_pred_imputed_regr)
# #print(col)
# plt.xticks(())
# plt.yticks(())
# plt.show()
feature_cols = X_train.columns
# pair the feature names with the coefficients
print('Coefficients of all feature variables in training dataset are: ')
list(zip(feature_cols, regr.coef_))
Coefficients of all feature variables in training dataset are:
[('Value_SP500_DIV_YIELD_MONTH', -3.8987585275669585),
('Value_SP500_PE_RATIO_MONTH', -0.5672696500428746),
('Value_SHILLER_PE_RATIO_MONTH', -5.469247991877379),
('Value_SP500_EARNINGS_YIELD_MONTH', -6.41997252365831),
('Value_SP500_INFLADJ_MONTH', 0.12938040986247704),
('Value_SP500_EARNINGS_MONTH', 2.465306856497017),
('Value_SP500_PSR_QUARTER', 0.0),
('Value_SP500_SALES_QUARTER', -0.03987304360362631),
('Value_SP500_REAL_SALES_GROWTH_QUARTER', 0.00015575407657666528),
('Value_SP500_REAL_EARNINGS_GROWTH_QUARTER', 0.00015575407657666528)]
MSE, RMSE and R2 score are very bad with all features and PCA selected features.
from fbprophet import Prophet
# plt.style.available
plt.style.use("seaborn-whitegrid")
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import statsmodels.api as sm
from scipy import stats
#Reset index of df to make it a column as prerequisite for fbProphet.
df_fb = df_imputed.reset_index()
df_interpolate_fb = df_interpolate.reset_index()
df_interpolate_fb.tail() #for further use.
#Change dtype of Date column to datetime64 for fbProphet
df_interpolate_fb.loc[:,'Date'] = pd.to_datetime(df_interpolate_fb.loc[:,'Date'],format = '%Y%m%d')
#Plotting of S&P interpolated price vs date
# First Subplot
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14,5))
ax1.plot(df_interpolate_fb['Date'], df_interpolate_fb['Value_SP500_REAL_PRICE_MONTH'],color ='blue')
#ax1.plot(df_fb['Date'], df_fb['Value_SP500_REAL_PRICE_MONTH'],color='red')
ax1.set_xlabel("Date", fontsize=12)
ax1.set_ylabel("Real Price")
ax1.set_title("S&P 500 Index price History")
# #Second Subplot- original plot
# ax1.plot(df_fb['Date'], df_fb['Value_SP500_REAL_PRICE_MONTH'],color='red')
# ax1.set_xlabel("Date", fontsize=12)
# ax1.set_ylabel("Original S&P 500 Real Index price")
# ax1.set_title("S&P 500 Index price History")
#Third subplot
ax2.plot(df_interpolate_fb['Date'], df_interpolate_fb['Value_SP500_REAL_SALES_GROWTH_YEAR'],color='orange')
ax2.set_xlabel("Date", fontsize=12)
ax2.set_ylabel("SP500_REAL_SALES_GROWTH_YEAR")
ax2.set_title("SP500_REAL_SALES_GROWTH_YEAR")
plt.show()
#Data preparation for fbprophet model. model takes date/timeseries and target label as input for fit and predict
df_interpolate_fb = df_interpolate_fb[['Date','Value_SP500_REAL_PRICE_MONTH']]
#Rename column names to 'ds' and 'y' as required by fbprophet model.
df_interpolate_fb.rename(columns={'Date':'ds','Value_SP500_REAL_PRICE_MONTH' : 'y'},inplace=True)
print(df_interpolate_fb.info(),df_interpolate_fb.shape)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3548 entries, 0 to 3547 Data columns (total 2 columns): ds 3548 non-null datetime64[ns] y 3548 non-null float64 dtypes: datetime64[ns](1), float64(1) memory usage: 55.5 KB None (3548, 2)
fb = Prophet()
fb.fit(df_interpolate_fb)
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this. C:\Users\Akshat\Anaconda3\lib\site-packages\pystan\misc.py:399: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
<fbprophet.forecaster.Prophet at 0x23b4f130048>
#Create future dates
future_dates = fb.make_future_dataframe(periods=30)
#Predict prices for future dates
future_price = fb.predict(future_dates)
future_price[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail()
| ds | yhat | yhat_lower | yhat_upper | |
|---|---|---|---|---|
| 3573 | 2019-01-26 | 2104.546587 | 1953.286430 | 2264.258040 |
| 3574 | 2019-01-27 | 2109.010824 | 1955.294901 | 2260.299157 |
| 3575 | 2019-01-28 | 2109.505319 | 1957.157929 | 2260.275119 |
| 3576 | 2019-01-29 | 2111.732478 | 1960.086347 | 2277.521839 |
| 3577 | 2019-01-30 | 2115.552914 | 1960.619623 | 2279.389418 |
import matplotlib.dates as mdates
# Dates
starting_date = dt.datetime(2018, 11, 30)
starting_date1 = mdates.date2num(starting_date)
trend_date = dt.datetime(2019, 1, 1)
trend_date1 = mdates.date2num(trend_date)
pointing_arrow = dt.datetime(2019, 1, 1)
pointing_arrow1 = mdates.date2num(pointing_arrow)
# Plot.
fig = fb.plot(future_price)
ax1 = fig.add_subplot(111)
ax1.set_title("S&P500 Index Price Forecast", fontsize=16)
ax1.set_xlabel("Date", fontsize=12)
ax1.set_ylabel("Real Price", fontsize=12)
# Forecast initialization arrow
ax1.annotate('Forecast \n Initialization', xy=(pointing_arrow1, 2100), xytext=(starting_date1,2500),
arrowprops=dict(facecolor='#ff7f50', shrink=0.1),
)
# Trend emphasis arrow
ax1.annotate('Upward Trend', xy=(trend_date1, 2108), xytext=(trend_date1,950),
arrowprops=dict(facecolor='#6cff6c', shrink=0.1),
)
ax1.axhline(y=1260, color='b', linestyle='-')
plt.show()
C:\Users\Akshat\Anaconda3\lib\site-packages\matplotlib\cbook\deprecation.py:107: MatplotlibDeprecationWarning: Adding an axes using the same arguments as a previous axes currently reuses the earlier instance. In a future version, a new instance will always be created and returned. Meanwhile, this warning can be suppressed, and the future behavior ensured, by passing a unique label to each axes instance.
fig2 = fb.plot_components(future_price)
plt.show()
# Monthly Data Predictions
fbm = Prophet(changepoint_prior_scale=0.01).fit(df_interpolate_fb)
future = fbm.make_future_dataframe(periods=12, freq='M')
fcst = fbm.predict(future)
fig = fbm.plot(fcst)
plt.title("Monthly Prediction \n 1 year time frame")
plt.show()
INFO:fbprophet.forecaster:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.
fig3 = fbm.plot_components(fcst)
plt.show()